Consolidated slab/rr-explicit level-3 macrokernels.

Details:
- Consolidated the *sl.c and *rr.c level-3 macrokernels into a single
  file per sl/rr pair, with those files named as they were before
  c92762e. The consolidation does not take away the *option* of using
  slab or round-robin assignment of micropanels to threads; it merely
  *hides* the choice within the definitions of functions such as
  bli_thread_range_jrir(), bli_packm_my_iter(), and bli_is_last_iter()
  rather than expose that choice explicitly in the code. The choice of
  slab or rr is not always hidden, however; there are some cases
  involving herk and trmm, for example, that require some part of the
  computation to use rr unconditionally. (The --thread-part-jrir option
  controls the partitioning in all other cases.)
- Note: Originally, the sl and rr macrokernels were separated out for
  clarity. However, aside from the additional binary code bloat, I later
  deemed that clarity not worth the price of maintaining the additional
  (mostly similar) codes.
This commit is contained in:
Field G. Van Zee
2018-10-17 14:11:01 -05:00
parent 57eab3a4f0
commit 71c5832d5f
53 changed files with 6127 additions and 1807 deletions

View File

@@ -35,6 +35,72 @@
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
void bli_packm_blk_var1
(
obj_t* c,
@@ -44,14 +110,632 @@ void bli_packm_blk_var1
thrinfo_t* t
)
{
#ifdef BLIS_ENABLE_JRIR_SLAB
num_t dt_cp = bli_obj_dt( c );
bli_packm_blk_var1sl( c, p, cntx, cntl, t );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
#else // BLIS_ENABLE_JRIR_RR
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
bli_packm_blk_var1rr( c, p, cntx, cntl, t );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = κ
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
// Define the datatype-specific packm blocked variant 1 via a type-generic
// macro template. Each instantiation iterates over the logical micropanels
// of the source matrix, handling three cases per panel: (1) unstored
// triangular panels (skipped), (2) diagonal-intersecting triangular panels
// (packed with round-robin thread assignment unconditionally), and
// (3) Hermitian/symmetric or general panels (packed with the slab or
// round-robin assignment selected at configure-time).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes were somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* NOTE: We MUST use round-robin partitioning when packing
micropanels of a triangular matrix. Hermitian/symmetric
and general packing may use slab or round-robin, depending
on which was selected at configure-time. */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
}
// Instantiate the variant above for each real/complex datatype pair
// (s, c, d, z), producing bli_spackm_blk_var1, bli_cpackm_blk_var1, etc.
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -1,737 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1rr);
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
// bli_packm_blk_var1rr(): object-based wrapper for the round-robin-specific
// instance of packm blocked variant 1 (removed by this commit in favor of
// the consolidated bli_packm_blk_var1). Extracts all properties from the
// source object c and packed object p, resolves kappa, selects the
// structure-aware packm kernel for the schema, and dispatches to the
// datatype-specific variant.
void bli_packm_blk_var1rr
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
)
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1rr )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -1,737 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packm_fp
// Signature of the type-specific (float/scomplex/double/dcomplex) blocked
// packm variant that the object-based wrapper below dispatches to.
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
// Table of the four datatype instantiations of packm_blk_var1sl, indexed
// by the num_t of the object being packed.
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1sl);
// Default lookup table mapping each pack schema (see bli_pack_schema_index())
// to the structure-aware packing kernel func_t for that schema. NULL entries
// mean the schema is not supported for that datatype (e.g. induced-method
// schemas have no real-domain kernels).
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
// bli_packm_blk_var1sl: object-based entry point for the blocked packm
// variant that uses slab ("sl") assignment of micropanels to threads.
// Unpacks the fields of objects c (source) and p (packed destination),
// chooses the kappa scalar and the structure-aware packing kernel for the
// pack schema of p, and dispatches to the datatype-specific implementation
// selected from the ftypes table.
void bli_packm_blk_var1sl
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
)
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
// NOTE: The #if 0 branches below preserve an earlier/alternative
// design (schema-family tables, and a cntx-supplied packm func_t
// override) that is currently disabled; only the lookup-table path
// between them is live.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
// Macro template for the type-specific blocked packm variant with slab
// partitioning. For each datatype ch, this defines PASTEMAC(ch,varname)
// (ie: ?packm_blk_var1sl), which walks the logical micropanels of the
// source matrix C and packs each panel assigned to the current thread into
// the packed buffer P via the structure-aware kernel packm_ker.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: This is the "sl" variant, so slab
partitioning is requested here. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes was
somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* NOTE(review): this slab ("sl") variant tests iteration
membership with the _rr predicate in this (diagonal-
intersecting) case, while the other cases below use _sl.
This looks deliberate -- round-robin for the variable-length
triangular panels, per the consolidation notes -- but
confirm against the consolidated bli_packm_blk_var1(). */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
// Instantiate the macro above for all four real/complex datatypes.
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -51,7 +51,18 @@
\
( start <= i && i < end )
// Define a general-purpose version of bli_packm_my_iter() whose definition
// depends on whether slab or round-robin partitioning was requested at
// configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
#define bli_packm_my_iter bli_packm_my_iter_sl
#else // BLIS_ENABLE_JRIR_RR
#define bli_packm_my_iter bli_packm_my_iter_rr
#endif
//

View File

@@ -49,10 +49,8 @@ void PASTEMAC0(opname) \
thrinfo_t* t \
);
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
GENPROT( packm_blk_var1sl )
GENPROT( packm_blk_var1rr )
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -108,6 +106,5 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( packm_blk_var1sl )
INSERT_GENTPROT_BASIC0( packm_blk_var1rr )
INSERT_GENTPROT_BASIC0( packm_blk_var1 )

View File

@@ -39,26 +39,32 @@
// gemm
// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// herk
// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// trmm
// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
#define bli_trmm_my_iter( index, thread ) \
#define bli_trmm_my_iter_rr( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
// trsm
#define bli_trsm_my_iter( index, thread ) \
#define bli_trsm_my_iter_rr( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )

View File

@@ -58,30 +58,15 @@ cntl_t* bli_gemmbp_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2sl;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2sl;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2sl;
else macro_kernel_fp = NULL;
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
else /* should never execute */ macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2rr;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2rr;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2rr;
else macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node

View File

@@ -116,8 +116,7 @@ void bli_gemm_int
if ( im != BLIS_NAT )
{
if ( im == BLIS_4M1B )
if ( f == bli_gemm_ker_var2sl ||
f == bli_gemm_ker_var2rr ) f = bli_gemm4mb_ker_var2;
if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
}
}

View File

@@ -0,0 +1,379 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed gemm
// macrokernels defined later in this file.
#define FUNCPTR_T gemm_fp
// Signature shared by every datatype-specific instance of the gemm
// macrokernel (generated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
// Object-based entry point for the gemm macrokernel. Extracts the raw
// buffers, strides, and packing geometry from the obj_t operands, resolves
// the alpha/beta scalars, optionally recasts a 1m-packed execution to the
// real domain, and dispatches to the typed implementation.
void bli_gemm_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute.
    num_t dt = bli_obj_exec_dt( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // If 1m is being employed on a column- or row-stored matrix with a
    // real-valued beta, we can use the real domain macro-kernel, which
    // eliminates a little overhead associated with the 1m virtual
    // micro-kernel. NOTE(review): this macro presumably updates dt,
    // schema_a, and the dimension/stride variables in place, consistent
    // with the ftypes[dt] lookup occurring afterwards.
    if ( bli_is_1m_packed( schema_a ) )
    {
        bli_l3_ind_recast_1m_params
        (
          dt,
          schema_a,
          c,
          m, n, k,
          pd_a, ps_a,
          pd_b, ps_b,
          rs_c, cs_c
        );
    }

    // Select the typed instance for the (possibly recast) datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
// Define the typed gemm macrokernels: the 2nd (jr) and 1st (ir) loops
// around the microkernel. One instance of this macro is expanded per
// datatype via INSERT_GENTFUNC_BASIC0 below (e.g. bli_dgemm_ker_var2).
// Whether micropanels are assigned to threads in slab or round-robin
// fashion is hidden inside bli_thread_range_jrir(); see the NOTE in the
// body below.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for the 2nd and 1st loops.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add in the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )

View File

@@ -60,8 +60,7 @@ GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:
GENPROT( gemm4mb_ker_var2 ) // 4m1b
@@ -93,8 +92,7 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
// Headers for induced algorithms:
INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b

View File

@@ -0,0 +1,555 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed herk (lower)
// macrokernels defined later in this file.
#define FUNCPTR_T herk_fp
// Signature shared by every datatype-specific instance of the lower-stored
// herk macrokernel; diagoffc is the diagonal offset of the current panel
// of C.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
// Object-based entry point for the herk macrokernel on a lower-stored C.
// Extracts raw buffers, strides, packing geometry, and the diagonal offset
// of C from the obj_t operands, resolves alpha/beta, and dispatches to the
// typed implementation.
void bli_herk_l_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute, and the diagonal
    // offset of the current panel of C.
    num_t  dt       = bli_obj_exec_dt( c );
    doff_t diagoffc = bli_obj_diag_offset( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the typed instance for the execution datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
// Define the typed herk macrokernels for a lower-stored C: the 2nd (jr)
// and 1st (ir) loops around the microkernel. One instance is expanded per
// datatype via INSERT_GENTFUNC_BASIC0 below. The jr loop is split into a
// rectangular region (no microtiles touch the diagonal; partitioned via
// bli_thread_range_jrir(), which honors the configure-time slab/rr choice)
// and a triangular region (microtiles may intersect the diagonal;
// partitioned unconditionally round-robin via bli_thread_range_jrir_rr()).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of C, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. Any remainder from this integer division is discarded, which
is what we want. That is, we want the rectangular region to contain
as many columns of whole microtiles as possible without including any
microtiles that intersect the diagonal. The number of iterations in
the triangular (or trapezoidal) region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_rct = diagoffc / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the initial rectangular region of C (if it exists).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Within this rectangular region, no microtile intersects the
diagonal (by construction of n_iter_rct above), so every
MR x NR submatrix is computed and stored as it would be in
gemm. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and the default (slab or rr) partitioning in the 1st loop for the
remaining triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
by the number of iterations used for the rectangular region. */ \
jr_start += n_iter_rct; \
jr_end += n_iter_rct; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,558 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed herk (upper)
// macrokernels defined later in this file.
#define FUNCPTR_T herk_fp
// Signature shared by every datatype-specific instance of the upper-stored
// herk macrokernel; diagoffc is the diagonal offset of the current panel
// of C.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
// Object-based entry point for the herk macrokernel on an upper-stored C.
// Extracts raw buffers, strides, packing geometry, and the diagonal offset
// of C from the obj_t operands, resolves alpha/beta, and dispatches to the
// typed implementation.
void bli_herk_u_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute, and the diagonal
    // offset of the current panel of C.
    num_t  dt       = bli_obj_exec_dt( c );
    doff_t diagoffc = bli_obj_diag_offset( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the typed instance for the execution datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
#undef GENTFUNC

// Type-generic macrokernel for herk/syrk when C is upper-stored (only the
// region on or above the diagonal of C is updated). The 2nd (jr) loop is
// split into two regions: the triangular region of micropanel columns that
// intersect the diagonal (always partitioned round-robin among threads so
// the uneven, diagonal-clipped work is balanced) and the remaining
// rectangular region strictly above the diagonal (partitioned with the
// default slab or round-robin scheme chosen at configure-time).
// Instantiated for each datatype below via INSERT_GENTFUNC_BASIC0.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero.
NOTE: It's possible that after this pruning that the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in C. A non-zero remainder means we need to
add one additional iteration. That is, we want the triangular region
to contain as few columns of whole microtiles as possible while still
including all microtiles that intersect the diagonal. The number of
iterations in the rectangular region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and the default (slab or rr) partitioning in the 1st loop for the
initial triangular region of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
/* NOTE: the _rr flavor of the last-iteration test is used here to
match the round-robin partitioning employed unconditionally for
this (triangular) region. */ \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal (the stored part of upper-stored C).
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd loop of the
remaining rectangular region of C (and also use default partitioning
for the 1st loop).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Every iteration of this loop lies in the rectangular region of C,
which is strictly above the diagonal, so each MR x NR submatrix
may be computed and stored as in an ordinary gemm (no diagonal
test needed). */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -56,13 +56,10 @@ void PASTEMAC0(opname) \
//GENPROT( herk_blk_var2 )
//GENPROT( herk_blk_var3 )
GENPROT( herk_x_ker_var2sl )
GENPROT( herk_x_ker_var2rr )
GENPROT( herk_x_ker_var2 )
GENPROT( herk_l_ker_var2sl )
GENPROT( herk_l_ker_var2rr )
GENPROT( herk_u_ker_var2sl )
GENPROT( herk_u_ker_var2rr )
GENPROT( herk_l_ker_var2 )
GENPROT( herk_u_ker_var2 )
//GENPROT( herk_packa )
//GENPROT( herk_packb )
@@ -94,8 +91,6 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( herk_l_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2rr )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2rr )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )

View File

@@ -35,12 +35,12 @@
#include "blis.h"
static gemm_var_oft vars_sl[2] =
static gemm_var_oft vars[2] =
{
bli_herk_l_ker_var2sl, bli_herk_u_ker_var2sl,
bli_herk_l_ker_var2, bli_herk_u_ker_var2,
};
void bli_herk_x_ker_var2sl
void bli_herk_x_ker_var2
(
obj_t* a,
obj_t* ah,
@@ -59,48 +59,7 @@ void bli_herk_x_ker_var2sl
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars_sl[uplo];
// Call the macrokernel.
f
(
a,
ah,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static gemm_var_oft vars_rr[2] =
{
bli_herk_l_ker_var2rr, bli_herk_u_ker_var2rr,
};
void bli_herk_x_ker_var2rr
(
obj_t* a,
obj_t* ah,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t uplo;
gemm_var_oft f;
// Set a bool based on the uplo field of C's root object.
if ( bli_obj_root_is_lower( c ) ) uplo = 0;
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars_rr[uplo];
f = vars[uplo];
// Call the macrokernel.
f

View File

@@ -0,0 +1,533 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
// Object-based entry point for the trmm_ll macrokernel. This routine
// unpacks the numerical fields of the obj_t operands, merges the scalars
// attached to A and B into a single alpha, and dispatches to the typed
// macrokernel selected by C's execution datatype.
void bli_trmm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Detach the scalars attached to A and B and fold them into one
	// scalar object (scalar_b holds the product afterwards).
	obj_t scalar_a;
	obj_t scalar_b;

	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Alpha is the merged scalar; beta is the scalar attached to C.
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Select the typed macrokernel for C's execution datatype.
	num_t     dt_exec = bli_obj_exec_dt( c );
	FUNCPTR_T f       = ftypes[ dt_exec ];

	// Invoke the typed function, querying each operand field in place.
	f
	(
	  bli_obj_diag_offset( a ),
	  bli_obj_pack_schema( a ),
	  bli_obj_pack_schema( b ),
	  bli_obj_length( c ),
	  bli_obj_width( c ),
	  bli_obj_width( a ),
	  buf_alpha,
	  bli_obj_buffer_at_off( a ), bli_obj_col_stride( a ),
	  bli_obj_panel_dim( a ), bli_obj_panel_stride( a ),
	  bli_obj_buffer_at_off( b ), bli_obj_row_stride( b ),
	  bli_obj_panel_dim( b ), bli_obj_panel_stride( b ),
	  buf_beta,
	  bli_obj_buffer_at_off( c ), bli_obj_row_stride( c ),
	  bli_obj_col_stride( c ),
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC

// Type-generic macrokernel for trmm/trmm3 with a lower-triangular A packed
// on the left. Micropanels of A that intersect the diagonal were packed
// with reduced length, so their panel (and 4m1/3m1 imaginary) strides are
// recomputed on the fly. Diagonal-intersecting panels scale C by beta;
// panels strictly below the diagonal accumulate with a scalar of one, which
// lets the same kernel serve both trmm and trmm3.
// Instantiated for each datatype below via INSERT_GENTFUNC_BASIC0.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
/* Length of and offset into the portion of the current micro-panel of A
that was actually packed (the region covering the A10/A11 blocks --
hence the _a1011 suffix). */ \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )

View File

@@ -0,0 +1,541 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
// Object-based entry point for the trmm_lu macrokernel. Collapses the
// obj_t operands to raw buffers/strides, folds the scalars attached to A
// and B into a single alpha, and calls the typed macrokernel chosen by
// C's execution datatype.
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Peel off the scalars attached to A and B and multiply them
	// together; sb ends up holding the product.
	obj_t sa;
	obj_t sb;

	bli_obj_scalar_detach( a, &sa );
	bli_obj_scalar_detach( b, &sb );
	bli_mulsc( &sa, &sb );

	// The merged scalar serves as alpha; C's attached scalar as beta.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &sb );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel for C's execution datatype.
	FUNCPTR_T kfp = ftypes[ bli_obj_exec_dt( c ) ];

	// Invoke the typed function, reading each operand field in place.
	kfp
	(
	  bli_obj_diag_offset( a ),
	  bli_obj_pack_schema( a ),
	  bli_obj_pack_schema( b ),
	  bli_obj_length( c ),
	  bli_obj_width( c ),
	  bli_obj_width( a ),
	  alpha_buf,
	  bli_obj_buffer_at_off( a ), bli_obj_col_stride( a ),
	  bli_obj_panel_dim( a ), bli_obj_panel_stride( a ),
	  bli_obj_buffer_at_off( b ), bli_obj_row_stride( b ),
	  bli_obj_panel_dim( b ), bli_obj_panel_stride( b ),
	  beta_buf,
	  bli_obj_buffer_at_off( c ), bli_obj_row_stride( c ),
	  bli_obj_col_stride( c ),
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

/* Type-instantiating macro for the trmm "lu" macrokernel, in which the
   triangular matrix A appears on the left and is stored in the upper
   triangle. (A block of A lying strictly below the diagonal is implicitly
   zero, which is why the safeguard below returns early in that case.)
   One function per datatype is generated by the INSERT_GENTFUNC_BASIC0
   invocation at the bottom. Micropanels of B (the 2nd/jr loop) are
   partitioned among threads via bli_thread_range_jrir(), which hides the
   slab-vs-round-robin choice made at configure-time; parallelism in the
   1st (ir) loop is currently disabled. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffa_i; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_a1112; \
	dim_t           off_a1112; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_a_num; \
	inc_t           ss_a_den; \
	inc_t           ps_a_cur; \
	inc_t           is_a_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely below the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	/* Round odd imaginary strides up to the nearest even value. */ \
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
	dim_t jr_start, jr_end; \
	/*dim_t ir_start, ir_end;*/ \
	dim_t jr_inc; \
\
	/* Determine the thread range and increment for the 2nd loop.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. \
	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Diagonal offset of the current MR x k micropanel of A. */ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly above the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1112 = diagoffa_i; \
				k_a1112   = k - off_a1112; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* NOTE: ir loop parallelism disabled for now. */ \
				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				/*}*/ \
\
				/* Advance A by the stride of the diagonal-intersecting
				   micropanel just used (generally shorter than the stride
				   of a full micropanel). */ \
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				/* NOTE: ir loop parallelism disabled for now. */ \
				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
				/*}*/ \
\
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

View File

@@ -0,0 +1,598 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp

// Signature shared by all type-specific trmm_rl_ker_var2 implementations
// (instantiated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of type-specific kernel function pointers, indexed by datatype.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);

// Object-based entry point for the trmm_rl macrokernel. Unpacks the
// buffer addresses, strides, panel geometry, and pack schemas from the
// obj_t operands, merges the scalars attached to A and B into a single
// alpha, and dispatches to the type-specific kernel selected by the
// execution datatype of C.
void bli_trmm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffb = bli_obj_diag_offset( b );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer addresses, strides, and micropanel geometry of the packed
	// operands (and of C, which is unpacked).
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// combined scalar (left in scalar_b).
	obj_t scalar_a, scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the kernel that matches the execution datatype and
	// invoke it.
	FUNCPTR_T ker_fp = ftypes[ dt_exec ];

	ker_fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef  GENTFUNC

/* Type-instantiating macro for the trmm "rl" macrokernel, in which the
   triangular matrix B appears on the right and is stored in the lower
   triangle. The 2nd (jr) loop is split into two regions: the rectangular
   part of B, whose micropanels may be partitioned among threads with
   either slab or round-robin assignment (the choice is hidden inside
   bli_thread_range_jrir()), and the diagonal-intersecting (triangular)
   part, which unconditionally uses round-robin assignment via
   bli_trmm_my_iter_rr(). One function per datatype is generated by the
   INSERT_GENTFUNC_BASIC0 invocation at the bottom. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffb, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffb_j; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_b1121; \
	dim_t           off_b1121; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_b_num; \
	inc_t           ss_b_den; \
	inc_t           ps_b_cur; \
	inc_t           is_b_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else                                      { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j        = -diagoffb; \
		k        = k - j; \
		diagoffb = 0; \
		a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	/* Round odd imaginary strides up to the nearest even value. */ \
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	/* 'thread' is the thrinfo_t node for the 2nd (jr) loop; query the
	   sub-node ('caucus') for the 1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Note that we partition the 2nd loop into two regions: the rectangular
	   part of B, and the triangular portion. */ \
	dim_t n_iter_rct; \
	dim_t n_iter_tri; \
\
	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
	{ \
		/* If the entire panel of B does not intersect the diagonal, there is
		   no triangular region, and therefore we can skip the second set of
		   loops. */ \
		n_iter_rct = n_iter; \
		n_iter_tri = 0; \
	} \
	else \
	{ \
		/* If the panel of B does intersect the diagonal, compute the number of
		   iterations in the rectangular region by dividing NR into the diagonal
		   offset. (There should never be any remainder in this division.) The
		   number of iterations in the triangular (or trapezoidal) region is
		   computed as the remaining number of iterations in the n dimension. */ \
		n_iter_rct = diagoffb / NR; \
		n_iter_tri = n_iter - n_iter_rct; \
	} \
\
	/* Determine the thread range and increment for the 2nd and 1st loops for
	   the initial rectangular region of B (if it exists).
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. \
	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		{ \
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = ir_start; i < ir_end; i += ir_inc ) \
			{ \
				ctype* restrict a2; \
\
				a1  = a_cast + i * rstep_a; \
				c11 = c1     + i * rstep_c; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
				{ \
					a2 = a_cast; \
					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
			} \
		} \
	} \
\
	/* If there is no triangular region, then we're done. */ \
	if ( n_iter_tri == 0 ) return; \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd and
	   1st loops for the remaining triangular region of B (if it exists).
	   NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
	   employ a hack that calls for each thread to execute every iteration
	   of the jr and ir loops but skip all but the pointer increment for
	   iterations that are not assigned to it. */ \
\
	/* Advance the starting b1 and c1 pointers to the positions corresponding
	   to the start of the triangular region of B. */ \
	jr_start = n_iter_rct; \
	b1 = b_cast + jr_start * cstep_b; \
	c1 = c_cast + jr_start * cstep_c; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		/* Diagonal offset of the current k x NR micropanel of B. */ \
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121   = k - off_b1121; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly below the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur  = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				} \
\
				/* Every thread advances the A and C pointers, even for
				   iterations it did not compute (the round-robin "hack"
				   described above). */ \
				a1  += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += ps_b_cur; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )

View File

@@ -0,0 +1,618 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp

// Signature shared by all type-specific trmm_ru_ker_var2 implementations
// (instantiated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of type-specific kernel function pointers, indexed by datatype.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);

// Object-based entry point for the trmm_ru macrokernel. Unpacks the
// buffer addresses, strides, panel geometry, and pack schemas from the
// obj_t operands, merges the scalars attached to A and B into a single
// alpha, and dispatches to the type-specific kernel selected by the
// execution datatype of C.
void bli_trmm_ru_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffb = bli_obj_diag_offset( b );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer addresses, strides, and micropanel geometry of the packed
	// operands (and of C, which is unpacked).
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// combined scalar (left in scalar_b).
	obj_t scalar_a, scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the kernel that matches the execution datatype and
	// invoke it.
	FUNCPTR_T ker_fp = ftypes[ dt_exec ];

	ker_fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
/*
   Typed macrokernel for trmm_ru (right-side trmm with an upper-stored
   triangular B). The 2nd (jr) loop is partitioned into two regions: the
   initial diagonal-intersecting ("triangular") region of B, which always
   uses round-robin assignment of micropanels to threads (via
   bli_trmm_my_iter_rr()), and the remaining rectangular region, which is
   partitioned via bli_thread_range_jrir() (slab or round-robin, depending
   on the configure-time --thread-part-jrir option).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and
1st loops for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the remaining rectangular region of B.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )

View File

@@ -56,17 +56,12 @@ void PASTEMAC0(opname) \
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )
GENPROT( trmm_xx_ker_var2sl )
GENPROT( trmm_xx_ker_var2rr )
GENPROT( trmm_xx_ker_var2 )
GENPROT( trmm_ll_ker_var2sl )
GENPROT( trmm_ll_ker_var2rr )
GENPROT( trmm_lu_ker_var2sl )
GENPROT( trmm_lu_ker_var2rr )
GENPROT( trmm_rl_ker_var2sl )
GENPROT( trmm_rl_ker_var2rr )
GENPROT( trmm_ru_ker_var2sl )
GENPROT( trmm_ru_ker_var2rr )
GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )
//
@@ -96,12 +91,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )

View File

@@ -35,13 +35,13 @@
#include "blis.h"
static gemm_var_oft vars_sl[2][2] =
static gemm_var_oft vars[2][2] =
{
{ bli_trmm_ll_ker_var2sl, bli_trmm_lu_ker_var2sl },
{ bli_trmm_rl_ker_var2sl, bli_trmm_ru_ker_var2sl }
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
};
void bli_trmm_xx_ker_var2sl
void bli_trmm_xx_ker_var2
(
obj_t* a,
obj_t* b,
@@ -73,62 +73,7 @@ void bli_trmm_xx_ker_var2sl
}
// Index into the variant array to extract the correct function pointer.
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static gemm_var_oft vars_rr[2][2] =
{
{ bli_trmm_ll_ker_var2rr, bli_trmm_lu_ker_var2rr },
{ bli_trmm_rl_ker_var2rr, bli_trmm_ru_ker_var2rr }
};
void bli_trmm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
gemm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
f = vars[side][uplo];
// Call the macrokernel.
f

View File

@@ -58,24 +58,12 @@ cntl_t* bli_trsm_l_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointer to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2sl;
macro_kernel_p = bli_trsm_xx_ker_var2;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointer to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;
@@ -162,16 +150,11 @@ cntl_t* bli_trsm_r_cntl_create
pack_t schema_b
)
{
// trsm macrokernels are presently disabled for right-side execution,
// so it doesn't matter which function pointer we use here (sl or rr).
// To be safe, we'll insert an abort() guard to alert the developers
// of this should right-side macrokernels ever be re-enabled.
void* macro_kernel_p = bli_trsm_xx_ker_var2sl;
// NOTE: trsm macrokernels are presently disabled for right-side execution.
void* macro_kernel_p = bli_trsm_xx_ker_var2;
void* packa_fp = bli_packm_blk_var1sl;
void* packb_fp = bli_packm_blk_var1sl;
bli_abort();
void* packa_fp = bli_packm_blk_var1;
void* packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;

View File

@@ -0,0 +1,604 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
// Object-based entry point for the lower-triangular, left-side trsm
// macrokernel. This wrapper unpacks the raw buffers, strides, and packing
// geometry from the obj_t operands and then dispatches to the
// datatype-specific implementation via the ftypes function pointer array.
void bli_trsm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed kernel is invoked.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of A and the pack schemas of both packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; k is the width of A.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer, stride, and panel geometry for packed matrix A.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, stride, and panel geometry for packed matrix B.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides for output matrix C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular matrix) serves as the
	// alpha for the gemmtrsm subproblems; it may be unit if it was already
	// applied during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as the "beta" for the gemm-only
	// subproblems on micro-panels that do not intersect the diagonal. It is
	// kept separate because the alpha attached to B may have been reset
	// during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed function for the execution datatype and invoke it.
	ftypes[dt_exec]
	(
	  diagoffa,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Typed macrokernel for trsm_ll (left-side trsm with a lower-stored
   triangular A). Micro-panels of A that intersect the diagonal are handled
   by the fused gemmtrsm microkernel; panels strictly below the diagonal use
   the plain gemm microkernel; panels strictly above the diagonal are
   implicitly zero and are skipped. Parallelism is available only in the
   2nd (jr) loop; the 1st (ir) loop carries inter-iteration dependencies.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )

View File

@@ -0,0 +1,585 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type shared by all datatype
// instances of this macrokernel. (The name "gemm_fp" is reused here even
// though this file implements trsm; the signature below is what matters.)
#define FUNCPTR_T gemm_fp

// Signature of the datatype-specific trsm_lu macrokernels defined by the
// GENTFUNC template below. Operands:
//   diagoffa          diagonal offset of the packed triangular matrix A
//   schema_a/schema_b pack schemas of A and B
//   m, n, k           problem dimensions
//   alpha1            scalar applied within the gemmtrsm subproblems
//   a, cs_a, pd_a, ps_a  packed A: column stride (PACKMR), panel dim (MR),
//                        panel stride
//   b, rs_b, pd_b, ps_b  packed B: row stride (PACKNR), panel dim (NR),
//                        panel stride
//   alpha2            "beta" scalar for the gemm-only subproblems
//   c, rs_c, cs_c     output matrix C and its strides
typedef void (*FUNCPTR_T)
     (
       doff_t     diagoffa,
       pack_t     schema_a,
       pack_t     schema_b,
       dim_t      m,
       dim_t      n,
       dim_t      k,
       void*      alpha1,
       void*      a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*      b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*      alpha2,
       void*      c, inc_t rs_c, inc_t cs_c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Table of type-specific function pointers, indexed by datatype value.
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
// Object-based entry point for the lower-right (left-side, upper-triangular)
// trsm macrokernel. Unpacks the operand metadata from the obj_t wrappers and
// dispatches to the appropriate datatype-specific implementation.
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed-matrix geometry for A: buffer, column stride, panel dim (MR),
	// and panel stride.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed-matrix geometry for B: buffer, row stride, panel dim (NR),
	// and panel stride.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular operand) becomes the
	// alpha used in the gemmtrsm subproblems (i.e., the scalar that would
	// have been applied to the packed copy of B prior to the trsm update).
	// It may be unit if it was already applied during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C becomes the "beta" used in the gemm-only
	// subproblems for micro-panels that do not intersect the diagonal. A
	// separate scalar is needed because the one attached to B may have been
	// reset during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation and invoke it.
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp
	(
	  diagoffa,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

// Type-specific trsm_lu macrokernel template. Iterates over micro-panels of
// C bottom-to-top (the solve proceeds upward for an upper-triangular A),
// invoking the fused gemmtrsm microkernel on diagonal-intersecting panels
// of A and the plain gemm microkernel on panels strictly above the diagonal.
// Fixes relative to the previous revision: removed a stray double semicolon
// after the k_a1112 assignment, and corrected the comment describing the
// B11/B21 address computation (it previously referred to B01/B11, which
// describes the lower-triangular variant).
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t     diagoffa, \
       pack_t     schema_a, \
       pack_t     schema_b, \
       dim_t      m, \
       dim_t      n, \
       dim_t      k, \
       void*      alpha1, \
       void*      a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*      b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*      alpha2, \
       void*      c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    cntx, \
       rntm_t*    rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt          = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR          = pd_a; \
	const dim_t     NR          = pd_b; \
	const dim_t     PACKMR      = cs_a; \
	const dim_t     PACKNR      = rs_b; \
\
	/* Cast the micro-kernel address to its function pointer type. */ \
	PASTECH(ch,gemmtrsm_ukr_ft) \
	       gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
	PASTECH(ch,gemm_ukr_ft) \
	       gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero        = PASTEMAC(ch,0); \
	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
	ctype* restrict a_cast      = a; \
	ctype* restrict b_cast      = b; \
	ctype* restrict c_cast      = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffa_i; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_a1112; \
	dim_t           k_a11; \
	dim_t           k_a12; \
	dim_t           off_a11; \
	dim_t           off_a12; \
	dim_t           i, j, ib; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_a_num; \
	inc_t           ss_a_den; \
	inc_t           ps_a_cur; \
	inc_t           is_a_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
	   So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full as k inflated up to a multiple of MR. This is
	   needed because some parameter combinations of trsm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. Note that real-only, imag-only, and summed-only
	   packing formats are not applicable here since trsm is a two-
	   operand operation only (unlike trmm, which is capable of three-
	   operand). */ \
	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else                                 { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Check the k dimension, which needs to be a multiple of MR. If k
	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
	   kernel, which is expecting to perform an MR x MR triangular solve.
	   This adjustment of k is consistent with what happened when A was
	   packed: all of its bottom/right edges were zero-padded, and
	   furthermore, the panel that stores the bottom-right corner of the
	   matrix has its diagonal extended into the zero-padded region (as
	   identity). This allows the trsm of that bottom-right panel to
	   proceed without producing any infs or NaNs that would infect the
	   "good" values of the corresponding block of B. */ \
	if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
	   know that the underlying buffer was already allocated to have an m
	   dimension that is a multiple of PACKMR, with the region between the
	   last row and the next multiple of MR zero-padded accordingly. */ \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* We don't bother querying the thrinfo_t node for the 1st loop because
	   we can't parallelize that loop in trsm due to the inter-iteration
	   dependencies that exist. */ \
	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
\
	dim_t jr_start, jr_end; \
	dim_t jr_inc; \
\
	/* Determine the thread range and increment for the 2nd loop.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time.
	   NOTE: Parallelism in the 1st loop is unattainable due to the
	   inter-iteration dependencies present in trsm. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* For the upper-triangular solve, walk the m dimension from the
		   bottom of C upward, starting with the last micro-panel. */ \
		a1  = a_cast; \
		c11 = c1 + (m_iter-1)*rstep_c; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( ib = 0; ib < m_iter; ++ib ) \
		{ \
			i          = m_iter - 1 - ib; \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, use a
			   special micro-kernel that performs a fused gemm and trsm.
			   If the current panel of A resides above the diagonal, use a
			   a regular gemm micro-kernel. Otherwise, if it is below the
			   diagonal, it was not packed (because it is implicitly zero)
			   and so we do nothing. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a11; \
				ctype* restrict a12; \
				ctype* restrict b11; \
				ctype* restrict b21; \
				ctype* restrict a2; \
\
				/* Compute various offsets into and lengths of parts of A. */ \
				off_a11 = diagoffa_i; \
				k_a1112 = k - off_a11; \
				k_a11   = MR; \
				k_a12   = k_a1112 - MR; \
				off_a12 = off_a11 + k_a11; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* Compute the addresses of the triangular block A11 and the
				   panel A12. */ \
				a11 = a1; \
				/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
				a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
				/* Compute the addresses of the block B11 and the panel
				   B21. */ \
				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
				b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + ps_a_cur; \
				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the bottom edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + rstep_a; \
				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  alpha2_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        alpha2_cast, \
					                        c11, rs_c,  cs_c ); \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 -= rstep_c; \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter     = %lu\n", m_iter ); \
printf( "m_cur      = %lu\n", m_cur ); \
printf( "k          = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112  = %lu\n", off_a1112 ); \
printf( "k_a1112    = %lu\n", k_a1112 ); \
printf( "k_a12      = %lu\n", k_a12 ); \
printf( "k_a11      = %lu\n", k_a11 ); \
printf( "rs_c,cs_c  = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )

View File

@@ -428,7 +428,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
@@ -514,7 +514,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\

View File

@@ -421,7 +421,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
@@ -507,7 +507,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\

View File

@@ -58,14 +58,10 @@ GENPROT( trsm_blk_var3 )
GENPROT( trsm_packa )
GENPROT( trsm_packb )
GENPROT( trsm_xx_ker_var2sl )
GENPROT( trsm_xx_ker_var2rr )
GENPROT( trsm_ll_ker_var2sl )
GENPROT( trsm_ll_ker_var2rr )
GENPROT( trsm_lu_ker_var2sl )
GENPROT( trsm_lu_ker_var2rr )
GENPROT( trsm_xx_ker_var2 )
GENPROT( trsm_ll_ker_var2 )
GENPROT( trsm_lu_ker_var2 )
GENPROT( trsm_rl_ker_var2 )
GENPROT( trsm_ru_ker_var2 )
@@ -97,11 +93,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )

View File

@@ -35,13 +35,13 @@
#include "blis.h"
static trsm_var_oft vars_sl[2][2] =
static trsm_var_oft vars[2][2] =
{
{ bli_trsm_ll_ker_var2sl, bli_trsm_lu_ker_var2sl },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2sl
void bli_trsm_xx_ker_var2
(
obj_t* a,
obj_t* b,
@@ -73,62 +73,7 @@ void bli_trsm_xx_ker_var2sl
}
// Index into the variant array to extract the correct function pointer.
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static trsm_var_oft vars_rr[2][2] =
{
{ bli_trsm_ll_ker_var2rr, bli_trsm_lu_ker_var2rr },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
trsm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
f = vars[side][uplo];
// Call the macrokernel.
f

View File

@@ -804,6 +804,15 @@ static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}
// packbuf_t-related

View File

@@ -59,32 +59,6 @@ void bli_thread_finalize( void )
{
}
// -----------------------------------------------------------------------------
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
)
{
//#ifdef BLIS_JRIR_INTERLEAVE
#if 1
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
#else
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
*inc = 1;
#endif
}
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub

View File

@@ -57,19 +57,6 @@ void bli_thread_finalize( void );
#endif
// Thread range-related prototypes.
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
);
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub
(
@@ -261,7 +248,6 @@ static void bli_thread_range_jrir_sl
*inc = 1;
}
#if 0
static void bli_thread_range_jrir
(
thrinfo_t* thread,
@@ -273,6 +259,9 @@ static void bli_thread_range_jrir
dim_t* inc
)
{
// Define a general-purpose version of bli_thread_range_jrir() whose
// definition depends on whether slab or round-robin partitioning was
// requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
@@ -280,6 +269,7 @@ static void bli_thread_range_jrir
#endif
}
#if 0
static void bli_thread_range_weighted_jrir
(
thrinfo_t* thread,

View File

@@ -59,24 +59,10 @@ cntl_t* blx_gemmbp_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
macro_kernel_fp = blx_gemm_ker_var2;
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = blx_gemm_ker_var2sl;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = bli_gemm_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node

View File

@@ -0,0 +1,375 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "blix.h"
// Function pointer type for the datatype-specific gemm macrokernels
// instantiated below. Operands: pack schemas for A and B, problem
// dimensions m/n/k, scalars alpha and beta, the packed buffers for A and B
// (with their packed-matrix strides: column/row stride, imaginary stride,
// panel dim, and panel stride), and the output matrix C with its strides.
typedef void (*gemm_fp)
     (
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Function pointer array for datatype-specific functions, indexed by
// datatype value. Entry order follows the framework's datatype enumeration:
// float (0), scomplex (1), double (2), dcomplex (3).
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
{
	PASTECH2(blx_,s,gemm_ker_var2),
	PASTECH2(blx_,c,gemm_ker_var2),
	PASTECH2(blx_,d,gemm_ker_var2),
	PASTECH2(blx_,z,gemm_ker_var2)
};
// Object-based entry point for the sandbox gemm macrokernel. Extracts the
// operand metadata from the obj_t wrappers, merges the scalars attached to
// A and B into a single alpha, and dispatches to the datatype-specific
// implementation.
void blx_gemm_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed-matrix geometry for A: buffer, column stride, imaginary
	// stride, panel dim (MR), and panel stride.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const inc_t  is_a     = bli_obj_imag_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed-matrix geometry for B: buffer, row stride, imaginary stride,
	// panel dim (NR), and panel stride.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const inc_t  is_b     = bli_obj_imag_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// scalar (accumulated into scalar_b).
	obj_t scalar_a;
	obj_t scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// The merged scalar serves as alpha; the scalar attached to C is beta.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation and invoke it.
	gemm_fp fp = ftypes[ dt_exec ];

	fp
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, cs_a, is_a,
	         pd_a, ps_a,
	  buf_b, rs_b, is_b,
	         pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

// Type-specific gemm macrokernel template: the jr (n dimension) and ir
// (m dimension) loops around the gemm microkernel, with micropanels
// assigned to threads via bli_thread_range_jrir() (slab or round-robin,
// chosen at configure-time).
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(blx_,ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Determine the thread range and increment for the 2nd and 1st loops.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B, which
			   the microkernel may use for prefetching. On this thread's
			   last ir iteration, wrap around to the first panel of A (and
			   similarly for B on the last jr iteration). */ \
			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Handle interior and edge cases separately. */ \
			if ( m_cur == MR && n_cur == NR ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
			else \
			{ \
				/* Invoke the gemm micro-kernel, accumulating into the
				   zeroed temporary buffer ct. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* Scale the bottom edge of C and add the result from above. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

#if 0
GENTFUNC( float,    s, gemm_ker_var2 )
GENTFUNC( double,   d, gemm_ker_var2 )
GENTFUNC( scomplex, c, gemm_ker_var2 )
GENTFUNC( dcomplex, z, gemm_ker_var2 )
#else
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
#endif

View File

@@ -58,8 +58,7 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
GENPROT( gemm_ker_var2 )
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -87,6 +86,5 @@ void PASTECH2(blx_,ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )