Multithreading optimizations for l3 macrokernels.

Details:
- Adjusted the method by which micropanels are assigned to threads in
  the 2nd (jr) and 1st (ir) loops around the microkernel to (mostly)
  employ contiguous "slab" partitioning rather than interleaved (round
  robin) partitioning. The new partitioning schemes and related details
  for specific families of operations are listed below:
  - gemm: slab partitioning.
  - herk: slab partitioning for region corresponding to non-triangular
          region of C; round robin partitioning for triangular region.
  - trmm: slab partitioning for region corresponding to non-triangular
          region of B; round robin partitioning for triangular region.
          (NOTE: This affects both left- and right-side macrokernels:
          trmm_ll, trmm_lu, trmm_rl, trmm_ru.)
  - trsm: slab partitioning.
          (NOTE: This affects only left-side macrokernels trsm_ll,
          trsm_lu; right-side macrokernels were not touched.)
  Also note that the previous macrokernels were preserved inside the
  'other' directory of each operation family directory (e.g.
  frame/3/gemm/other, frame/3/herk/other, etc.).
- Updated gemm macrokernel in sandbox/ref99 in light of above changes
  and fixed a stale function pointer type in blx_gemm_int.c
  (gemm_voft -> gemm_var_oft).
- Added standalone test drivers in test/3m4m for herk, trmm, and trsm
  and minor changes to test/3m4m/Makefile.
- Updated the arguments and definitions of bli_*_get_next_[ab]_upanel()
  and bli_trmm_?_?r_my_iter() macros defined in bli_l3_thrinfo.h.
- Renamed bli_thread_get_range*() APIs to bli_thread_range*().
This commit is contained in:
Field G. Van Zee
2018-09-30 18:54:56 -05:00
parent 60b2650d74
commit ac18949a4b
43 changed files with 8562 additions and 366 deletions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -38,24 +39,28 @@
// gemm
#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// herk
#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// trmm
#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
#define bli_trmm_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
// trsm
#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trsm_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
//
// thrinfo_t APIs specific to level-3 operations.

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_gemm_blk_var1
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_gemm_blk_var2
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -268,14 +269,27 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -290,7 +304,7 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -300,12 +314,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
dim_t jr_inc = jr_num_threads; \
dim_t ir_inc = ir_num_threads; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
@@ -295,11 +299,11 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \

View File

@@ -0,0 +1,366 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the typed gemm macrokernel defined below. One
// instance is generated per datatype via the GENTFUNC mechanism and the
// resulting pointers are gathered into the 'ftypes' array for runtime
// dispatch on the execution datatype.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of typed macrokernel function pointers, indexed by datatype (num_t).
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
// Object-based wrapper for the gemm macrokernel (variant 2). Extracts the
// raw buffers, strides, panel dimensions, and pack schemas from the obj_t
// operands, resolves the alpha/beta scalars, and then dispatches to the
// typed macrokernel that matches C's execution datatype.
void bli_gemm_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_exec   = bli_obj_exec_dt( c );
	pack_t    schema_a  = bli_obj_pack_schema( a );
	pack_t    schema_b  = bli_obj_pack_schema( b );
	dim_t     m         = bli_obj_length( c );
	dim_t     n         = bli_obj_width( c );
	dim_t     k         = bli_obj_width( a );
	void*     buf_a     = bli_obj_buffer_at_off( a );
	inc_t     cs_a      = bli_obj_col_stride( a );
	inc_t     is_a      = bli_obj_imag_stride( a );
	dim_t     pd_a      = bli_obj_panel_dim( a );
	inc_t     ps_a      = bli_obj_panel_stride( a );
	void*     buf_b     = bli_obj_buffer_at_off( b );
	inc_t     rs_b      = bli_obj_row_stride( b );
	inc_t     is_b      = bli_obj_imag_stride( b );
	dim_t     pd_b      = bli_obj_panel_dim( b );
	inc_t     ps_b      = bli_obj_panel_stride( b );
	void*     buf_c     = bli_obj_buffer_at_off( c );
	inc_t     rs_c      = bli_obj_row_stride( c );
	inc_t     cs_c      = bli_obj_col_stride( c );
	obj_t     scalar_a;
	obj_t     scalar_b;
	void*     buf_alpha;
	void*     buf_beta;
	FUNCPTR_T f;
	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );
	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( c );
	// If 1m is being employed on a column- or row-stored matrix with a
	// real-valued beta, we can use the real domain macro-kernel, which
	// eliminates a little overhead associated with the 1m virtual
	// micro-kernel.
#if 1
	if ( bli_is_1m_packed( schema_a ) )
	{
		// NOTE(review): this macro appears to rewrite dt_exec and the
		// dims/strides in place (its arguments are passed by name as
		// lvalues) so that 1m-packed operands can be computed upon as
		// real-domain data — confirm against the macro's definition.
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}
#endif
	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];
	// Invoke the function.
	f( schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	   pd_a, ps_a,
	   buf_b, rs_b, is_b,
	   pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}
// Type-generating macro that defines the typed gemm macrokernel for each
// datatype. This is the preserved ("other") variant in which micropanels
// are assigned to threads in interleaved (round-robin) fashion in both the
// 2nd (jr) and 1st (ir) loops around the microkernel: a thread with work
// id t executes iterations t, t + n_way, t + 2*n_way, ... of its loop.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* 'thread' is the thrinfo_t node for the 2nd (jr) loop around the
	   microkernel; its sub-node ('caucus') corresponds to the 1st (ir)
	   loop. Each node supplies the n_way and work id used to interleave
	   its loop's iterations across threads. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
	dim_t jr_num_threads = bli_thread_n_way( thread ); \
	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
\
	/* Loop over the n dimension (NR columns at a time). Iterations are
	   assigned round-robin: this thread handles j = jr_thread_id,
	   jr_thread_id + jr_num_threads, ... */ \
	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time), also assigned
		   round-robin across the threads of the ir loop's caucus. */ \
		for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B (used
			   by the microkernel for prefetching); wrap back to the first
			   panel(s) on this thread's last iteration. */ \
			a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Handle interior and edge cases separately. */ \
			if ( m_cur == MR && n_cur == NR ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
			else \
			{ \
				/* Edge case: compute the full MR x NR microtile into the
				   temporary buffer with beta = 0 ... */ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* ... then scale only the valid m_cur x n_cur portion of C
				   by beta and add in the corresponding part of ct. */ \
				/* Scale the bottom edge of C and add the result from above. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -279,17 +280,57 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of C, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. Any remainder from this integer division is discarded, which
is what we want. That is, we want the rectangular region to contain
as many columns of whole microtiles as possible without including any
microtiles that intersect the diagonal. The number of iterations in
the triangular (or trapezoidal) region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_rct = diagoffc / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -304,7 +345,112 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
by the number of iterations used for the rectangular region. */ \
jr_start += n_iter_rct; \
jr_end += n_iter_rct; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -317,12 +463,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
and treat this case as if the diagonal offset were zero.
NOTE: It's possible that after this pruning that the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
@@ -279,17 +282,57 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in C. A non-zero remainder means we need to
add one additional iteration. That is, we want the triangular region
to contain as few columns of whole microtiles as possible while still
including all microtiles that intersect the diagonal. The number of
iterations in the rectangular region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in the
2nd loop for the initial triangular region of C (if it exists). For both
the rectangular and triangular regions, use contiguous assignment for the
1st loop. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -304,7 +347,7 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -317,12 +360,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -402,6 +445,111 @@ void PASTEMAC(ch,varname) \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the remaining triangular region of C. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
void bli_herk_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_l macrokernels (one per datatype). */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd and 1st loops. NOTE(review): round robin is presumably used here
(instead of the slab partitioning used by gemm) because the diagonal of C
makes per-micropanel work uneven, which slabs would leave imbalanced --
confirm against the jr/ir partitioning used by the gemm macrokernels. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,409 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
void bli_herk_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_l macrokernels (one per datatype).
   NOTE(review): this appears to be the legacy variant that uses interleaved
   (round robin) jr/ir thread partitioning and the older
   bli_herk_get_next_[ab]_upanel() argument order (thrinfo_t* first) --
   confirm it is the copy preserved under the 'other/' directory. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Query the thrinfo_t node for the 1st (ir) loop and the thread
counts/ids for the 2nd (jr) and 1st (ir) loops. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
void bli_herk_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_u macrokernels (one per datatype). */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd and 1st loops. NOTE(review): round robin is presumably used here
(instead of the slab partitioning used by gemm) because the diagonal of C
makes per-micropanel work uneven, which slabs would leave imbalanced --
confirm against the jr/ir partitioning used by the gemm macrokernels. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -0,0 +1,409 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Type signature shared by all datatype-specific instances of the
   herk_u_ker_var2 macrokernel defined below. Each instance receives the
   diagonal offset and pack schemas, the problem dimensions, the packed
   buffers for A and B (with their panel dims/strides and imaginary
   strides), the output matrix C with its row/column strides, and the
   runtime/threading state. */
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of type-specific function pointers, one per datatype, populated
   from the GENTFUNC-expanded definitions and indexed by the execution
   datatype (see ftypes[dt_exec] in bli_herk_u_ker_var2()). */
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
// Object-API entry point for the upper-stored herk macrokernel.
// Extracts all metadata from the obj_t operands, folds the scalars
// attached to A and B into a single effective alpha, and dispatches to
// the datatype-specific kernel instance via the ftypes[] array.
// ( cntl is accepted for signature uniformity; it is not referenced
//   in this function. )
void bli_herk_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype and diagonal offset of the output matrix.
	const num_t  dt_exec  = bli_obj_exec_dt( c );
	const doff_t diagoffc = bli_obj_diag_offset( c );

	// Pack schemas of the (packed) A and B operands.
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; k is the shared dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer, strides, and panel geometry for packed A.
	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const inc_t  is_a     = bli_obj_imag_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, strides, and panel geometry for packed B.
	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const inc_t  is_b     = bli_obj_imag_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides for the output matrix C.
	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together
	// so that a single merged value (left in scal_b) can serve as the
	// effective alpha for the computation.
	obj_t scal_a;
	obj_t scal_b;
	bli_obj_scalar_detach( a, &scal_a );
	bli_obj_scalar_detach( b, &scal_b );
	bli_mulsc( &scal_a, &scal_b );

	// The merged scalar acts as alpha; the scalar attached to C acts
	// as beta.
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scal_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Select the typed kernel instance that matches the execution
	// datatype.
	FUNCPTR_T f = ftypes[dt_exec];

	// Invoke the typed macrokernel.
	f( diagoffc,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	   pd_a, ps_a,
	   buf_b, rs_b, is_b,
	   pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}
/*
   Typed definition of the herk_u macrokernel (upper-stored C), generated
   for each datatype via the GENTFUNC mechanism.

   This variant partitions both the 2nd (jr) and 1st (ir) loops around the
   microkernel among threads in interleaved (round-robin) fashion: each
   thread begins at its work id and strides by the number of ways of
   parallelism in that loop (see the jr/ir loop headers below).

   Micro-tiles that intersect the diagonal are computed into a temporary
   buffer and accumulated into only the stored (upper) part of C; tiles
   strictly above the diagonal are computed directly (or via the temporary
   buffer for edge cases); tiles strictly below the diagonal are skipped.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Query the thrinfo_t node for the 1st (ir) loop around the microkernel
(the "caucus"), along with the number of ways of parallelism and the
work id for both the jr and ir loops. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). Iterations are
assigned to threads in interleaved (round-robin) fashion. */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time), likewise
partitioned round-robin among the threads of the ir loop. */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -85,6 +86,10 @@ void bli_trmm_front
}
#if 0
// NOTE: This case casts right-side trmm in terms of left side. This
// reduces the number of macrokernels exercised to two (trmm_ll and
// trmm_lu) but can lead to the microkernel being executed with an
// output matrix that is stored counter to its output preference.
// If A is being multiplied from the right, transpose all operands
// so that we can perform the computation as if A were being multiplied
@@ -98,6 +103,11 @@ void bli_trmm_front
}
#else
// NOTE: This case computes right-side trmm natively with trmm_rl and
// trmm_ru macrokernels. This code path always gives us the opportunity
// to transpose the entire operation so that the effective storage format
// of the output matrix matches the microkernel's output preference.
// Thus, from a performance perspective, this case is preferred.
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -317,29 +318,45 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -369,7 +386,8 @@ void PASTEMAC(ch,varname) \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
@@ -379,7 +397,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -433,13 +451,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
@@ -449,7 +467,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -498,17 +516,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -324,29 +325,45 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -376,7 +393,7 @@ void PASTEMAC(ch,varname) \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
@@ -386,7 +403,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -440,13 +457,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
@@ -456,7 +473,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -505,17 +522,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -324,15 +325,151 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of B (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -358,7 +495,6 @@ void PASTEMAC(ch,varname) \
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
@@ -366,7 +502,7 @@ void PASTEMAC(ch,varname) \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
@@ -375,7 +511,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
@@ -390,7 +526,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -449,83 +585,6 @@ void PASTEMAC(ch,varname) \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
@@ -324,16 +325,58 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -358,7 +401,6 @@ void PASTEMAC(ch,varname) \
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
@@ -366,7 +408,7 @@ void PASTEMAC(ch,varname) \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
@@ -375,7 +417,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
@@ -390,7 +432,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -449,30 +491,72 @@ void PASTEMAC(ch,varname) \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in both the 2nd and
1st loops the remaining triangular region of B. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -517,19 +601,12 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

View File

@@ -0,0 +1,519 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function-pointer type for the typed trmm_ll macro-kernel instances.
   NOTE: the underlying type name is gemm_fp because the argument list is
   identical to that of the gemm-style macro-kernels. */
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* ftypes: one typed kernel instance per datatype; the dispatcher below
   indexes this array with the execution datatype (num_t) of C. */
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
void bli_trmm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-API front end for the typed trmm_ll macro-kernels: unpack all
	   object fields into scalars/pointers, merge the scalars attached to A
	   and B, then dispatch on the execution datatype of C. */

	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t scalar_a;
	obj_t scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel instance for this datatype and invoke it. */
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp( diagoffa,
	    schema_a,
	    schema_b,
	    m, n, k,
	    buf_alpha,
	    buf_a, cs_a, pd_a, ps_a,
	    buf_b, rs_b, pd_b, ps_b,
	    buf_beta,
	    buf_c, rs_c, cs_c,
	    cntx,
	    rntm,
	    thread );
}
#undef  GENTFUNC

/* Typed trmm_ll macro-kernel (lower-triangular A on the left).
   Threads are assigned micropanels in the jr (2nd) and ir (1st) loops via
   interleaved (round-robin) partitioning: every thread executes every loop
   iteration but skips all work except the pointer increments for iterations
   that bli_trmm_my_iter() does not assign to it.
   FIX(review): the block opened by "if ( bli_trmm_my_iter( i, ir_thread ) ) {"
   in the diagonal-intersecting branch was never closed, leaving the braces
   unbalanced and placing "a1 += ps_a_cur;" inside the per-thread guard. The
   missing "}" is restored below, matching the strictly-below-diagonal branch
   of this same macro, so that all threads advance a1. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t  k_full; \
	dim_t  m_iter, m_left; \
	dim_t  n_iter, n_left; \
	dim_t  m_cur; \
	dim_t  n_cur; \
	dim_t  k_a1011; \
	dim_t  off_a1011; \
	dim_t  i, j; \
	inc_t  rstep_a; \
	inc_t  cstep_b; \
	inc_t  rstep_c, cstep_c; \
	inc_t  istep_a; \
	inc_t  istep_b; \
	inc_t  off_scl; \
	inc_t  ss_a_num; \
	inc_t  ss_a_den; \
	inc_t  ps_a_cur; \
	inc_t  is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region above where the diagonal of A intersects the
	   left edge of the block, adjust the pointer to C and treat this case as
	   if the diagonal offset were zero. This skips over the region that was
	   not packed. (Note we assume the diagonal offset is a multiple of MR;
	   this assumption will hold as long as the cache blocksizes are each a
	   multiple of MR and NR.) */ \
	if ( diagoffa < 0 ) \
	{ \
		i        = -diagoffa; \
		m        = m - i; \
		diagoffa = 0; \
		c_cast   = c_cast + (i  )*rs_c; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Round-robin thread partitioning: each thread visits every (j,i)
	   iteration and performs only the pointer increments for iterations
	   it does not own. */ \
	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
	dim_t      jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t      jr_thread_id   = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly below the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1011 = 0; \
				k_a1011   = bli_min( diagoffa_i + MR, k ); \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1011 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1011, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1011, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ); restored
				     closing brace so every thread advances a1 below. */ \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
		} \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )

View File

@@ -0,0 +1,527 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function-pointer type for the typed trmm_lu macro-kernel instances.
   NOTE: the underlying type name is gemm_fp because the argument list is
   identical to that of the gemm-style macro-kernels. */
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* ftypes: one typed kernel instance per datatype; the dispatcher below
   indexes this array with the execution datatype (num_t) of C. */
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-API front end for the typed trmm_lu macro-kernels: unpack all
	   object fields into scalars/pointers, merge the scalars attached to A
	   and B, then dispatch on the execution datatype of C. */

	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t scalar_a;
	obj_t scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel instance for this datatype and invoke it. */
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp( diagoffa,
	    schema_a,
	    schema_b,
	    m, n, k,
	    buf_alpha,
	    buf_a, cs_a, pd_a, ps_a,
	    buf_b, rs_b, pd_b, ps_b,
	    buf_beta,
	    buf_c, rs_c, cs_c,
	    cntx,
	    rntm,
	    thread );
}
#undef  GENTFUNC

/* Typed trmm_lu macro-kernel (upper-triangular A on the left).
   Threads are assigned micropanels in the jr (2nd) and ir (1st) loops via
   interleaved (round-robin) partitioning: every thread executes every loop
   iteration but skips all work except the pointer increments for iterations
   that bli_trmm_my_iter() does not assign to it. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t  k_full; \
	dim_t  m_iter, m_left; \
	dim_t  n_iter, n_left; \
	dim_t  m_cur; \
	dim_t  n_cur; \
	dim_t  k_a1112; \
	dim_t  off_a1112; \
	dim_t  i, j; \
	inc_t  rstep_a; \
	inc_t  cstep_b; \
	inc_t  rstep_c, cstep_c; \
	inc_t  istep_a; \
	inc_t  istep_b; \
	inc_t  off_scl; \
	inc_t  ss_a_num; \
	inc_t  ss_a_den; \
	inc_t  ps_a_cur; \
	inc_t  is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely below the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Round-robin thread partitioning: each thread visits every (j,i)
	   iteration and performs only the pointer increments for iterations
	   it does not own. */ \
	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
	dim_t      jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t      jr_thread_id   = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly above the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1112 = diagoffa_i; \
				k_a1112   = k - off_a1112; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ) */ \
\
				/* NOTE: executed by all threads so a1 stays in sync. */ \
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ) */ \
\
				/* NOTE: executed by all threads so a1 stays in sync. */ \
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
		} /* end if ( bli_trmm_my_iter( j, jr_thread ) ) */ \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

View File

@@ -0,0 +1,539 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper for the trmm_rl macrokernel: query all of the
	   properties needed by the typed kernel, resolve the scalars, and
	   dispatch to the datatype-specific implementation. */

	/* The execution datatype of C selects which typed kernel to call. */
	num_t   dt_exec  = bli_obj_exec_dt( c );

	/* Diagonal offset and pack schemas of the (packed) operands. */
	doff_t  diagoffb = bli_obj_diag_offset( b );
	pack_t  schema_a = bli_obj_pack_schema( a );
	pack_t  schema_b = bli_obj_pack_schema( b );

	/* Problem dimensions: C is m x n; k is the width of (packed) A. */
	dim_t   m        = bli_obj_length( c );
	dim_t   n        = bli_obj_width( c );
	dim_t   k        = bli_obj_width( a );

	/* Buffer addresses, packing strides, and panel geometry. */
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   cs_a     = bli_obj_col_stride( a );
	dim_t   pd_a     = bli_obj_panel_dim( a );
	inc_t   ps_a     = bli_obj_panel_stride( a );

	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   rs_b     = bli_obj_row_stride( b );
	dim_t   pd_b     = bli_obj_panel_dim( b );
	inc_t   ps_b     = bli_obj_panel_stride( b );

	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   rs_c     = bli_obj_row_stride( c );
	inc_t   cs_c     = bli_obj_col_stride( c );

	obj_t   scalar_a;
	obj_t   scalar_b;

	/* Detach the scalars attached to A and B and fold them into a single
	   alpha (accumulated into scalar_b by bli_mulsc). */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar serves as alpha; the scalar attached to C serves
	   as beta. */
	void*   alpha_buf = bli_obj_internal_scalar_buffer( &scalar_b );
	void*   beta_buf  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, pd_a, ps_a,
	  b_buf, rs_b, pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Type-instantiated macrokernel for trmm_rl: C := beta * C + alpha * A * B,
   where B is a packed lower-triangular matrix applied from the right.
   Iterations of the 2nd (jr) and 1st (ir) loops around the microkernel are
   assigned to threads in round-robin fashion via bli_trmm_my_iter(). */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffb, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. NOTE(review): the strides of this
	   temporary buffer are chosen to match the micro-kernel's preferred
	   (row- or column-oriented) output layout, as queried below, not
	   necessarily the storage of the original C matrix. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one = PASTEMAC(ch,1); \
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffb_j; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_b1121; \
	dim_t off_b1121; \
	dim_t i, j; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_b_num; \
	inc_t ss_b_den; \
	inc_t ps_b_cur; \
	inc_t is_b_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j = -diagoffb; \
		k = k - j; \
		diagoffb = 0; \
		a_cast = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Query the thrinfo_t node for the 1st (ir) loop and the number of
	   threads / work id for the 2nd (jr) loop. */ \
	thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
	dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121 = k - off_b1121; \
\
		a1 = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly below the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			/* NOTE: b1 advances on every jr iteration (outside the my_iter
			   guard) so that all threads track the packed panel pointers. */ \
			b1 += ps_b_cur; \
		} \
		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += cstep_b; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )

View File

@@ -0,0 +1,539 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper for the trmm_ru macrokernel: extract the
	   kernel parameters from the operand objects, fold the operand
	   scalars, and dispatch to the datatype-specific implementation. */

	/* The execution datatype of C selects which typed kernel to call. */
	num_t   dt_exec  = bli_obj_exec_dt( c );

	/* Diagonal offset and pack schemas of the (packed) operands. */
	doff_t  diagoffb = bli_obj_diag_offset( b );
	pack_t  schema_a = bli_obj_pack_schema( a );
	pack_t  schema_b = bli_obj_pack_schema( b );

	/* Problem dimensions: C is m x n; k is the width of (packed) A. */
	dim_t   m        = bli_obj_length( c );
	dim_t   n        = bli_obj_width( c );
	dim_t   k        = bli_obj_width( a );

	/* Buffer addresses, packing strides, and panel geometry. */
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   cs_a     = bli_obj_col_stride( a );
	dim_t   pd_a     = bli_obj_panel_dim( a );
	inc_t   ps_a     = bli_obj_panel_stride( a );

	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   rs_b     = bli_obj_row_stride( b );
	dim_t   pd_b     = bli_obj_panel_dim( b );
	inc_t   ps_b     = bli_obj_panel_stride( b );

	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   rs_c     = bli_obj_row_stride( c );
	inc_t   cs_c     = bli_obj_col_stride( c );

	obj_t   scalar_a;
	obj_t   scalar_b;

	/* Detach the scalars attached to A and B and fold them into a single
	   alpha (accumulated into scalar_b by bli_mulsc). */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar serves as alpha; the scalar attached to C serves
	   as beta. */
	void*   alpha_buf = bli_obj_internal_scalar_buffer( &scalar_b );
	void*   beta_buf  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, pd_a, ps_a,
	  b_buf, rs_b, pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Type-instantiated macrokernel for trmm_ru: C := beta * C + alpha * A * B,
   where B is a packed upper-triangular matrix applied from the right.
   Iterations of the 2nd (jr) and 1st (ir) loops around the microkernel are
   assigned to threads in round-robin fashion via bli_trmm_my_iter(). */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffb, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. NOTE(review): the strides of this
	   temporary buffer are chosen to match the micro-kernel's preferred
	   (row- or column-oriented) output layout, as queried below, not
	   necessarily the storage of the original C matrix. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one = PASTEMAC(ch,1); \
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffb_j; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_b0111; \
	dim_t off_b0111; \
	dim_t i, j; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_b_num; \
	inc_t ss_b_den; \
	inc_t ps_b_cur; \
	inc_t is_b_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely below its diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of B
	   intersects the top edge of the panel, adjust the pointer to C and
	   treat this case as if the diagonal offset were zero. This skips over
	   the region that was not packed. (Note we assume the diagonal offset
	   is a multiple of MR; this assumption will hold as long as the cache
	   blocksizes are each a multiple of MR and NR.) */ \
	if ( diagoffb > 0 ) \
	{ \
		j = diagoffb; \
		n = n - j; \
		diagoffb = 0; \
		c_cast = c_cast + (j )*cs_c; \
	} \
\
	/* If there is a zero region below where the diagonal of B intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffb + n < k ) \
	{ \
		k = -diagoffb + n; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Query the thrinfo_t node for the 1st (ir) loop and the number of
	   threads / work id for the 2nd (jr) loop. */ \
	thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
	dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to and length of the panel that was packed
		   so we can index into the corresponding location in A. */ \
		off_b0111 = 0; \
		k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
		a1 = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly above the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur = k_b0111 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b0111, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b0111, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			/* NOTE: b1 advances on every jr iteration (outside the my_iter
			   guard) so that all threads track the packed panel pointers. */ \
			b1 += ps_b_cur; \
		} \
		else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += cstep_b; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_trsm_blk_var1
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_trsm_blk_var2
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -339,25 +340,38 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -409,8 +423,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -474,8 +487,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -531,10 +543,6 @@ void PASTEMAC(ch,varname) \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -347,25 +348,38 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
@@ -419,8 +433,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -484,8 +497,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -541,10 +553,6 @@ void PASTEMAC(ch,varname) \
\
c11 -= rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*

View File

@@ -0,0 +1,593 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
/* Signature shared by every datatype instantiation of this macrokernel.
   Note the alias name is gemm_fp: the trsm macrokernels reuse the gemm
   macrokernel calling convention. alpha1 scales the gemmtrsm subproblems
   (attached to B); alpha2 is the "beta" for the gemm-only subproblems
   (attached to C). */
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of typed implementations (one per datatype), generated by the
   GENTFUNC invocation at the bottom of this file and indexed by num_t. */
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
void bli_trsm_ll_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based wrapper for the lower/left trsm macrokernel: unpack the
	// operand properties from the obj_t's and dispatch to the typed
	// implementation selected by the execution datatype of C.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular matrix A and pack schemas of the
	// packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; A supplies the k dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed buffer of A along with its panel geometry.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed buffer of B along with its panel geometry.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Output buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B serves as the alpha for the gemmtrsm
	// subproblems; it may already be unit if it was applied when B was
	// packed.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C serves as the "beta" for the gemm-only
	// subproblems (micro-panels that do not intersect the diagonal). It is
	// kept separate because the alpha on B may have been reset during
	// packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation for this datatype and invoke it.
	FUNCPTR_T    f          = ftypes[dt_exec];

	f
	(
	  diagoffa,
	  schema_a, schema_b,
	  m, n, k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed lower/left trsm macrokernel: for each NR-wide column panel of B/C,
   walk the MR-tall row panels of A top to bottom. Panels of A that intersect
   the diagonal use the fused gemmtrsm micro-kernel; panels strictly below
   the diagonal use a plain gemm micro-kernel (with alpha -1); panels above
   the diagonal are implicitly zero and were never packed, so they are
   skipped. */ \
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Interleaved (round-robin) partitioning of the jr loop: this thread
only executes iterations for which j mod nthreads equals its work id.
NOTE(review): newer variants of this macrokernel use contiguous slab
partitioning via bli_thread_range_jrir_sl() instead. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
/* On this thread's final jr iteration under round-robin
partitioning, wrap the B prefetch back to the first panel. */ \
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )

View File

@@ -0,0 +1,574 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
/* Signature shared by every datatype instantiation of this macrokernel.
   Aliased to gemm_fp because the trsm macrokernels reuse the gemm
   macrokernel calling convention. alpha1 scales the gemmtrsm subproblems
   (attached to B); alpha2 is the "beta" for the gemm-only subproblems
   (attached to C). */
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of typed implementations (one per datatype), generated by the
   GENTFUNC invocation at the bottom of this file and indexed by num_t. */
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based wrapper for the upper/left trsm macrokernel: unpack the
	// operand properties from the obj_t's and dispatch to the typed
	// implementation selected by the execution datatype of C.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular matrix A and pack schemas of the
	// packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; A supplies the k dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed buffer of A along with its panel geometry.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed buffer of B along with its panel geometry.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Output buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B serves as the alpha for the gemmtrsm
	// subproblems; it may already be unit if it was applied when B was
	// packed.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C serves as the "beta" for the gemm-only
	// subproblems (micro-panels that do not intersect the diagonal). It is
	// kept separate because the alpha on B may have been reset during
	// packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation for this datatype and invoke it.
	FUNCPTR_T    f          = ftypes[dt_exec];

	f
	(
	  diagoffa,
	  schema_a, schema_b,
	  m, n, k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed upper/left trsm macrokernel: for each NR-wide column panel of B/C,
   walk the MR-tall row panels of A bottom to top (backward substitution).
   Panels of A that intersect the diagonal use the fused gemmtrsm micro-
   kernel; panels strictly above the diagonal use a plain gemm micro-kernel
   (with alpha -1); panels below the diagonal are implicitly zero and were
   never packed, so they are skipped.
   Fixes in this revision: removed a stray double semicolon in the
   computation of k_a1112, and corrected a stale comment that referred to
   panel B01 where the code computes B21. */ \
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Interleaved (round-robin) partitioning of the jr loop: this thread
only executes iterations for which j mod nthreads equals its work id. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time), bottom to top. */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B21 and the block
B11. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
/* On this thread's final jr iteration under round-robin
partitioning, wrap the B prefetch back to the first panel. */ \
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 -= rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )

View File

@@ -0,0 +1,591 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
// Object-based entry point for the trsm_rl (right-side, lower-triangular)
// macro-kernel variant 2. Queries all fields required by the type-specific
// kernel from the operand objects, selects the implementation matching the
// execution datatype of C, and invokes it.
void bli_trsm_rl_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of C and diagonal offset of the triangular matrix B.
	num_t  dt       = bli_obj_exec_dt( c );
	doff_t diagoffb = bli_obj_diag_offset( b );

	// Pack schemas recorded when A and B were packed.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer addresses and packed micro-panel geometry for A, B, and C.
	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Alpha for the gemmtrsm subproblems: the internal scalar attached to A
	// (the non-triangular matrix). It may be unit if it was already applied
	// when A was packed.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( a );

	// "Beta" for the gemm-only subproblems (micro-panels of B that do not
	// intersect the diagonal): the internal scalar attached to C. Kept
	// separate because the alpha attached to B may have been reset during
	// packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the datatype-specific implementation.
	FUNCPTR_T fp = ftypes[ dt ];

	// Dispatch to the typed macro-kernel.
	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
	/* Typed macro-kernel for right-side/lower-triangular trsm. The operation
	   is cast as a transposed left-side trsm, so the triangular "A" matrix of
	   the micro-kernel is actually contained within B, and A/B roles (and
	   MR/NR, PACKMR/PACKNR) are swapped throughout. */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "rl" case (right-side/lower-
triangular), it becomes upper-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it so that
we can index to the correct place in C (corresponding to the
part of the panel of B that was packed).
NOTE: This is NOT being done to skip over "no-op" iterations,
as with the trsm_lu macro-kernel. This MUST be done for correct
execution because we use n (via n_iter) to compute diagonal and
index offsets for backwards movement through B. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
	/* (review) Column panels are traversed BACKWARD (j = n_iter-1-jb), as
	   required by lower-triangular right-side solves; c1 is decremented at
	   the bottom of each iteration to match. */ \
for ( jb = 0; jb < n_iter; ++jb ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
\
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1 + (n_iter-1)*cstep_c; \
\
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b11 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
b11 = b1; \
/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
	/* (review) Iterations over MR rows are assigned to threads in
	   round-robin (interleaved) fashion via bli_trsm_my_iter(). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
	/* (review) "last iteration for this thread" test: true when no
	   further i-iteration is assigned to this thread. */ \
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
	/* (review) Move C backward one NR-wide column panel to mirror the
	   backward traversal of B's column panels. */ \
c1 -= cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )

View File

@@ -0,0 +1,584 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
// Object-based entry point for the trsm_ru (right-side, upper-triangular)
// macro-kernel variant 2. Queries all fields required by the type-specific
// kernel from the operand objects, selects the implementation matching the
// execution datatype of C, and invokes it.
void bli_trsm_ru_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of C and diagonal offset of the triangular matrix B.
	num_t  dt       = bli_obj_exec_dt( c );
	doff_t diagoffb = bli_obj_diag_offset( b );

	// Pack schemas recorded when A and B were packed.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer addresses and packed micro-panel geometry for A, B, and C.
	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Alpha for the gemmtrsm subproblems: the internal scalar attached to A
	// (the non-triangular matrix). It may be unit if it was already applied
	// when A was packed.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( a );

	// "Beta" for the gemm-only subproblems (micro-panels of B that do not
	// intersect the diagonal): the internal scalar attached to C. Kept
	// separate because the alpha attached to B may have been reset during
	// packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the datatype-specific implementation.
	FUNCPTR_T fp = ftypes[ dt ];

	// Dispatch to the typed macro-kernel.
	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
	/* Typed macro-kernel for right-side/upper-triangular trsm. The operation
	   is cast as a transposed left-side trsm, so the triangular "A" matrix of
	   the micro-kernel is actually contained within B, and A/B roles (and
	   MR/NR, PACKMR/PACKNR) are swapped throughout. Unlike the "rl" case,
	   column panels are traversed FORWARD (left to right). */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "ru" case (right-side/upper-
triangular), it becomes lower-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t k_b01; \
dim_t off_b01; \
dim_t off_b11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides above the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b01 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
k_b01 = k_b0111 - NR; \
off_b11 = k_b01; \
\
/* Compute the addresses of the panel B10 and the triangular
block B11. */ \
b01 = b1; \
/* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \
b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
	/* (review) Iterations over MR rows are assigned to threads in
	   round-robin (interleaved) fashion via bli_trsm_my_iter(). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
	/* (review) "last iteration for this thread" test: true when no
	   further i-iteration is assigned to this thread. */ \
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
	/* (review) Advance C forward one NR-wide column panel, matching the
	   forward traversal of B's column panels. */ \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
// matrix is empty. This is not strictly needed but rather a minor
// optimization, as it would prevent threads that would otherwise get
// subproblems on BLIS_ZEROS operands from calling the macro-kernel,
// because bli_thread_get_range*() would return empty ranges, which would
// because bli_thread_range*() would return empty ranges, which would
// prevent the variant's for loop from executing any iterations.
// NOTE: this should only ever execute if the primary object is
// triangular because that is the only structure type with subpartitions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n )
!bli_is_strictly_below_diag_n( diagoff, m, n ) );
}
static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	// Return TRUE iff an m x n submatrix with diagonal offset diagoff does
	// not intersect the diagonal -- i.e., it lies entirely (strictly) above
	// or entirely (strictly) below the diagonal.
	const bool_t is_above = bli_is_strictly_above_diag_n( diagoff, m, n );
	const bool_t is_below = bli_is_strictly_below_diag_n( diagoff, m, n );

	return ( bool_t )( is_above || is_below );
}
static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
return ( bool_t )
@@ -784,10 +792,14 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
( i != 0 || n_left == 0 );
}
static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth )
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
return ( bool_t )
( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) );
#ifdef BLIS_JRIR_INTERLEAVE
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
#else
( i == end_iter - 1 );
#endif
}

View File

@@ -59,9 +59,35 @@ void bli_thread_finalize( void )
{
}
// -----------------------------------------------------------------------------
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
)
{
//#ifdef BLIS_JRIR_INTERLEAVE
#if 1
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
#else
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
*inc = 1;
#endif
}
#endif
// -----------------------------------------------------------------------------
void bli_thread_get_range_sub
void bli_thread_range_sub
(
thrinfo_t* thread,
dim_t n,
@@ -72,6 +98,9 @@ void bli_thread_get_range_sub
)
{
dim_t n_way = bli_thread_n_way( thread );
if ( n_way == 1 ) { *start = 0; *end = n; return; }
dim_t work_id = bli_thread_work_id( thread );
dim_t all_start = 0;
@@ -202,7 +231,7 @@ void bli_thread_get_range_sub
}
}
siz_t bli_thread_get_range_l2r
siz_t bli_thread_range_l2r
(
thrinfo_t* thr,
obj_t* a,
@@ -216,13 +245,13 @@ siz_t bli_thread_get_range_l2r
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, n, bf,
FALSE, start, end );
bli_thread_range_sub( thr, n, bf,
FALSE, start, end );
return m * ( *end - *start );
}
siz_t bli_thread_get_range_r2l
siz_t bli_thread_range_r2l
(
thrinfo_t* thr,
obj_t* a,
@@ -236,13 +265,13 @@ siz_t bli_thread_get_range_r2l
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, n, bf,
TRUE, start, end );
bli_thread_range_sub( thr, n, bf,
TRUE, start, end );
return m * ( *end - *start );
}
siz_t bli_thread_get_range_t2b
siz_t bli_thread_range_t2b
(
thrinfo_t* thr,
obj_t* a,
@@ -256,13 +285,13 @@ siz_t bli_thread_get_range_t2b
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, m, bf,
FALSE, start, end );
bli_thread_range_sub( thr, m, bf,
FALSE, start, end );
return n * ( *end - *start );
}
siz_t bli_thread_get_range_b2t
siz_t bli_thread_range_b2t
(
thrinfo_t* thr,
obj_t* a,
@@ -276,15 +305,15 @@ siz_t bli_thread_get_range_b2t
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, m, bf,
TRUE, start, end );
bli_thread_range_sub( thr, m, bf,
TRUE, start, end );
return n * ( *end - *start );
}
// -----------------------------------------------------------------------------
dim_t bli_thread_get_range_width_l
dim_t bli_thread_range_width_l
(
doff_t diagoff_j,
dim_t m,
@@ -495,17 +524,17 @@ siz_t bli_find_area_trap_l
// -----------------------------------------------------------------------------
siz_t bli_thread_get_range_weighted_sub
siz_t bli_thread_range_weighted_sub
(
thrinfo_t* thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* j_start_thr,
dim_t* j_end_thr
thrinfo_t* restrict thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* restrict j_start_thr,
dim_t* restrict j_end_thr
)
{
dim_t n_way = bli_thread_n_way( thread );
@@ -570,7 +599,7 @@ siz_t bli_thread_get_range_weighted_sub
// Compute the width of the jth subpartition, taking the
// current diagonal offset into account, if needed.
width_j =
bli_thread_get_range_width_l
bli_thread_range_width_l
(
diagoff_j, m, n_left,
j, n_way,
@@ -614,7 +643,7 @@ siz_t bli_thread_get_range_weighted_sub
bli_toggle_bool( &handle_edge_low );
// Compute the appropriate range for the rotated trapezoid.
area = bli_thread_get_range_weighted_sub
area = bli_thread_range_weighted_sub
(
thread, diagoff, uplo, m, n, bf,
handle_edge_low,
@@ -632,7 +661,7 @@ siz_t bli_thread_get_range_weighted_sub
return area;
}
siz_t bli_thread_get_range_mdim
siz_t bli_thread_range_mdim
(
dir_t direct,
thrinfo_t* thr,
@@ -678,20 +707,20 @@ siz_t bli_thread_get_range_mdim
if ( use_weighted )
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end );
return bli_thread_range_weighted_t2b( thr, x, bmult, start, end );
else
return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end );
return bli_thread_range_weighted_b2t( thr, x, bmult, start, end );
}
else
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_t2b( thr, x, bmult, start, end );
return bli_thread_range_t2b( thr, x, bmult, start, end );
else
return bli_thread_get_range_b2t( thr, x, bmult, start, end );
return bli_thread_range_b2t( thr, x, bmult, start, end );
}
}
siz_t bli_thread_get_range_ndim
siz_t bli_thread_range_ndim
(
dir_t direct,
thrinfo_t* thr,
@@ -737,20 +766,20 @@ siz_t bli_thread_get_range_ndim
if ( use_weighted )
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end );
return bli_thread_range_weighted_l2r( thr, x, bmult, start, end );
else
return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end );
return bli_thread_range_weighted_r2l( thr, x, bmult, start, end );
}
else
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_l2r( thr, x, bmult, start, end );
return bli_thread_range_l2r( thr, x, bmult, start, end );
else
return bli_thread_get_range_r2l( thr, x, bmult, start, end );
return bli_thread_range_r2l( thr, x, bmult, start, end );
}
}
siz_t bli_thread_get_range_weighted_l2r
siz_t bli_thread_range_weighted_l2r
(
thrinfo_t* thr,
obj_t* a,
@@ -782,7 +811,7 @@ siz_t bli_thread_get_range_weighted_l2r
}
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
FALSE, start, end
@@ -790,7 +819,7 @@ siz_t bli_thread_get_range_weighted_l2r
}
else // if dense or zeros
{
area = bli_thread_get_range_l2r
area = bli_thread_range_l2r
(
thr, a, bmult,
start, end
@@ -800,7 +829,7 @@ siz_t bli_thread_get_range_weighted_l2r
return area;
}
siz_t bli_thread_get_range_weighted_r2l
siz_t bli_thread_range_weighted_r2l
(
thrinfo_t* thr,
obj_t* a,
@@ -834,7 +863,7 @@ siz_t bli_thread_get_range_weighted_r2l
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
TRUE, start, end
@@ -842,7 +871,7 @@ siz_t bli_thread_get_range_weighted_r2l
}
else // if dense or zeros
{
area = bli_thread_get_range_r2l
area = bli_thread_range_r2l
(
thr, a, bmult,
start, end
@@ -852,7 +881,7 @@ siz_t bli_thread_get_range_weighted_r2l
return area;
}
siz_t bli_thread_get_range_weighted_t2b
siz_t bli_thread_range_weighted_t2b
(
thrinfo_t* thr,
obj_t* a,
@@ -886,7 +915,7 @@ siz_t bli_thread_get_range_weighted_t2b
bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
FALSE, start, end
@@ -894,7 +923,7 @@ siz_t bli_thread_get_range_weighted_t2b
}
else // if dense or zeros
{
area = bli_thread_get_range_t2b
area = bli_thread_range_t2b
(
thr, a, bmult,
start, end
@@ -904,7 +933,7 @@ siz_t bli_thread_get_range_weighted_t2b
return area;
}
siz_t bli_thread_get_range_weighted_b2t
siz_t bli_thread_range_weighted_b2t
(
thrinfo_t* thr,
obj_t* a,
@@ -939,7 +968,7 @@ siz_t bli_thread_get_range_weighted_b2t
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
area = bli_thread_get_range_weighted_sub
area = bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
TRUE, start, end
@@ -947,7 +976,7 @@ siz_t bli_thread_get_range_weighted_b2t
}
else // if dense or zeros
{
area = bli_thread_get_range_b2t
area = bli_thread_range_b2t
(
thr, a, bmult,
start, end

View File

@@ -6,6 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -56,7 +57,21 @@ void bli_thread_finalize( void );
#endif
// Thread range-related prototypes.
void bli_thread_get_range_sub
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
);
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub
(
thrinfo_t* thread,
dim_t n,
@@ -82,8 +97,8 @@ siz_t PASTEMAC0( opname ) \
dim_t* end \
);
GENPROT( thread_get_range_mdim )
GENPROT( thread_get_range_ndim )
GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )
#undef GENPROT
#define GENPROT( opname ) \
@@ -97,18 +112,18 @@ siz_t PASTEMAC0( opname ) \
dim_t* end \
);
GENPROT( thread_get_range_l2r )
GENPROT( thread_get_range_r2l )
GENPROT( thread_get_range_t2b )
GENPROT( thread_get_range_b2t )
GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )
GENPROT( thread_get_range_weighted_l2r )
GENPROT( thread_get_range_weighted_r2l )
GENPROT( thread_get_range_weighted_t2b )
GENPROT( thread_get_range_weighted_b2t )
GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )
dim_t bli_thread_get_range_width_l
dim_t bli_thread_range_width_l
(
doff_t diagoff_j,
dim_t m,
@@ -126,17 +141,17 @@ siz_t bli_find_area_trap_l
dim_t n,
doff_t diagoff
);
siz_t bli_thread_get_range_weighted_sub
siz_t bli_thread_range_weighted_sub
(
thrinfo_t* thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* j_start_thr,
dim_t* j_end_thr
thrinfo_t* restrict thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* restrict j_start_thr,
dim_t* restrict j_end_thr
);
@@ -215,5 +230,112 @@ void bli_thread_init_rntm( rntm_t* rntm );
void bli_thread_init_rntm_from_env( rntm_t* rntm );
// -----------------------------------------------------------------------------
//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc );
static void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use interleaved (round robin) partitioning of the jr/ir loops: the
	// thread with work id t executes iterations t, t + nway, t + 2*nway, ...
	// up to (but not including) n, where nway is the n_way parallelism of
	// this thrinfo_t node.
	// NOTE: bf and handle_edge_low are not used by round robin partitioning;
	// they exist only so that this function's signature matches that of
	// bli_thread_range_jrir_sl(). Cast them to void to avoid unused-parameter
	// warnings in every translation unit that includes this header.
	( void )bf;
	( void )handle_edge_low;

	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;
}
static void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use contiguous slab partitioning of the jr/ir loops: each thread is
	// assigned a single contiguous [*start, *end) range of the n iterations,
	// computed by bli_thread_range_sub() from the blocking factor bf and the
	// edge-case placement policy handle_edge_low. Slabs are walked with unit
	// stride.
	*inc = 1;

	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
}
static void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Compute this thread's [*start, *end) range and stride *inc over the
	// n iterations of a jr/ir loop. Honor the BLIS_JRIR_INTERLEAVE
	// configuration macro -- consistent with bli_thread_range_weighted_jrir()
	// -- rather than hard-coding the choice via '#if 0'. By default
	// (macro undefined) contiguous slab partitioning is used.
#ifdef BLIS_JRIR_INTERLEAVE
	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#else
	bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}
static void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Compute this thread's [*start, *end) range and stride *inc over the
	// jr/ir loop iterations of an m x n region whose diagonal offset is
	// diagoff, weighting the partitioning by the stored (uplo) area.
	// NOTE: the dead '#if 0' experiment that switched between slab and
	// round robin based on problem size has been removed.
#ifdef BLIS_JRIR_INTERLEAVE
	// Use interleaved (round robin) partitioning of the jr/ir loops; the
	// diagonal offset and uplo are ignored, and every thread strides over
	// all n iterations.
	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;
#else
	// Use contiguous slab partitioning of the jr/ir loops, weighted by the
	// diagonal's location so each slab covers a comparable stored area.
	bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
	                               handle_edge_low, start, end );

	// bli_thread_range_weighted_sub() returns the range in units of
	// elements; convert to units of micropanels of width bf, rounding the
	// end of the range up so that no partial trailing micropanel is lost.
	*start = *start / bf;
	*end   = ( *end + bf - 1 ) / bf;
	*inc   = 1;
#endif
}
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -46,10 +47,10 @@ void blx_gemm_int
thrinfo_t* thread
)
{
obj_t a_local;
obj_t b_local;
obj_t c_local;
gemm_voft f;
obj_t a_local;
obj_t b_local;
obj_t c_local;
gemm_var_oft f;
// Alias A, B, and C in case we need to update attached scalars.
bli_obj_alias_to( a, &a_local );

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void blx_gemm_blk_var1
dim_t my_start, my_end;
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
BLIS_FWD, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void blx_gemm_blk_var2
dim_t my_start, my_end;
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
BLIS_FWD, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -255,14 +256,27 @@ void PASTECH2(blx_,ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -277,7 +291,7 @@ void PASTECH2(blx_,ch,varname) \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -287,12 +301,12 @@ void PASTECH2(blx_,ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\"
STR_MT := -DTHR_STR=\"mt\"
# Problem size specification
PDEF_ST := -DP_BEGIN=40 \
PDEF_ST := -DP_BEGIN=96 \
-DP_END=2000 \
-DP_INC=40
-DP_INC=96
PDEF_MT := -DP_BEGIN=200 \
-DP_END=10000 \
-DP_INC=200
PDEF_MT := -DP_BEGIN=192 \
-DP_END=3000 \
-DP_INC=192
@@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt
blis-st: blis-gemm-st
blis-mt: blis-gemm-mt
blis-nat-st: blis-gemm-nat-st
blis-nat-mt: blis-gemm-nat-mt
openblas-st: openblas-gemm-st
openblas-mt: openblas-gemm-mt
@@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \
blis-gemm-mt: blis-gemm-nat-mt \
blis-gemm-ind-mt
blis-nat-st: \
test_sgemm_asm_blis_st.x \
test_dgemm_asm_blis_st.x \
test_cgemm_asm_blis_st.x \
test_zgemm_asm_blis_st.x \
test_sherk_asm_blis_st.x \
test_dherk_asm_blis_st.x \
test_cherk_asm_blis_st.x \
test_zherk_asm_blis_st.x \
test_strmm_asm_blis_st.x \
test_dtrmm_asm_blis_st.x \
test_ctrmm_asm_blis_st.x \
test_ztrmm_asm_blis_st.x \
test_strsm_asm_blis_st.x \
test_dtrsm_asm_blis_st.x \
test_ctrsm_asm_blis_st.x \
test_ztrsm_asm_blis_st.x
blis-nat-mt: \
test_sgemm_asm_blis_mt.x \
test_dgemm_asm_blis_mt.x \
test_cgemm_asm_blis_mt.x \
test_zgemm_asm_blis_mt.x \
test_sherk_asm_blis_mt.x \
test_dherk_asm_blis_mt.x \
test_cherk_asm_blis_mt.x \
test_zherk_asm_blis_mt.x \
test_strmm_asm_blis_mt.x \
test_dtrmm_asm_blis_mt.x \
test_ctrmm_asm_blis_mt.x \
test_ztrmm_asm_blis_mt.x \
test_strsm_asm_blis_mt.x \
test_dtrsm_asm_blis_mt.x \
test_ctrsm_asm_blis_mt.x \
test_ztrsm_asm_blis_mt.x
blis-gemm-nat-st: \
test_sgemm_asm_blis_st.x \
test_dgemm_asm_blis_st.x \
@@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@
# blis asm
test_d%_asm_blis_st.o: test_%.c
test_d%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_s%_asm_blis_st.o: test_%.c
test_s%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_z%_asm_blis_st.o: test_%.c
test_z%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_c%_asm_blis_st.o: test_%.c
test_c%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_d%_asm_blis_mt.o: test_%.c
test_d%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_s%_asm_blis_mt.o: test_%.c
test_s%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_z%_asm_blis_mt.o: test_%.c
test_z%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_c%_asm_blis_mt.o: test_%.c
test_c%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
# openblas

314
test/3m4m/test_herk.c Normal file
View File

@@ -0,0 +1,314 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for herk: times either BLIS (bli_herk) or a
// vendor BLAS (?syrk_/?herk_) over a sweep of problem sizes and prints one
// matlab-style data line per size. Datatype, induced method, size sweep, and
// threading label are injected at compile time via DT, IND, P_BEGIN/P_END/
// P_INC, THR_STR, and STR; BLIS vs. BLAS is selected by the BLIS macro.
int main( int argc, char** argv )
{
obj_t a, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
int m_input, k_input;
ind_t ind;
num_t dt, dt_real;
char dt_ch;
int r, n_repeats;
uplo_t uploc;
trans_t transa;
f77_char f77_uploc;
f77_char f77_transa;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
// Each problem size is timed n_repeats times; the best (minimum) time wins.
n_repeats = 3;
dt = DT;
// alpha for herk is real-valued, so project the datatype to its real analog.
dt_real = bli_dt_proj_to_real( DT );
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_inc = P_INC;
// Negative values mean "bind this dimension to p / |value|"; positive values
// would pin the dimension to a constant.
m_input = -1;
k_input = -1;
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
// Fixed operation parameters: update the lower triangle of C, no transpose.
uploc = BLIS_LOWER;
transa = BLIS_NO_TRANSPOSE;
// Translate the BLIS parameters to their Fortran-77 BLAS characters for the
// non-BLIS code path below.
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Sweep over the requested problem sizes.
for ( p = p_begin; p <= p_end; p += p_inc )
{
// Derive m and k for this iteration from p (see m_input/k_input above).
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt_real, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// A is m x k (or k x m if transposed); C is the m x m Hermitian result.
if ( bli_does_trans( transa ) )
bli_obj_create( dt, k, m, 0, 0, &a );
else
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, m, m, 0, 0, &c );
//bli_obj_create( dt, m, k, 2, 2*m, &a );
//bli_obj_create( dt, k, n, 2, 2*k, &b );
//bli_obj_create( dt, m, n, 2, 2*m, &c );
bli_obj_create( dt, m, m, 0, 0, &c_save );
bli_randm( &a );
bli_randm( &c );
// Mark C Hermitian with the chosen uplo so herk sees a valid operand.
bli_obj_set_struc( BLIS_HERMITIAN, &c );
bli_obj_set_uplo( uploc, &c );
bli_obj_set_conjtrans( transa, &a );
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a pristine copy of C so each repeat starts from identical data.
bli_copym( &c, &c_save );
#ifdef BLIS
// Enable exactly the requested induced method for this datatype.
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#ifdef BLIS
bli_herk( &alpha,
&a,
&beta,
&c );
#else
// Vendor BLAS path: dispatch on datatype to the corresponding
// ?syrk_/?herk_ routine with column-major leading dimensions.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
ssyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dsyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
// Keep the minimum wall-clock time observed across repeats.
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// Flop count is normalized as m*m*k; complex datatypes perform ~4x the
// real flops, hence the 4.0 factor below.
gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,
( unsigned long )k, gflops );
// Free the per-problem-size objects before the next iteration.
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

328
test/3m4m/test_trmm.c Normal file
View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for trmm (triangular matrix-matrix multiply).
// Sweeps problem sizes p from P_BEGIN to P_END in steps of P_INC, timing
// either the native BLIS implementation (when BLIS is #defined) or the
// Fortran-77 BLAS ?trmm_ routines, and prints matlab-readable gflops data.
// Returns 0 on completion; command-line arguments are ignored.
int main( int argc, char** argv )
{
    obj_t    a, c;
    obj_t    c_save;
    obj_t    alpha;
    dim_t    m, n;
    dim_t    p;
    dim_t    p_begin, p_end, p_inc;
    int      m_input, n_input;
    ind_t    ind;
    num_t    dt;
    char     dt_ch;
    int      r, n_repeats;
    side_t   side;
    uplo_t   uploa;
    trans_t  transa;
    diag_t   diaga;
    f77_char f77_side;
    f77_char f77_uploa;
    f77_char f77_transa;
    f77_char f77_diaga;
    double   dtime;
    double   dtime_save;
    double   gflops;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

    // Take the best of three timed repeats per problem size.
    n_repeats = 3;

    // DT, IND, P_BEGIN, P_END, P_INC, THR_STR, STR are compile-time
    // macros supplied by the Makefile.
    dt      = DT;
    ind     = IND;
    p_begin = P_BEGIN;
    p_end   = P_END;
    p_inc   = P_INC;

    // Negative values mean "set this dimension to p / |value|".
    m_input = -1;
    n_input = -1;

    // Suppress compiler warnings about unused variable 'ind'.
    ( void )ind;

#if 0
    cntx_t* cntx;
    ind_t ind_mod = ind;

    // A hack to use 3m1 as 1mpb (with 1m as 1mbp).
    if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;

    // Initialize a context for the current induced method and datatype.
    cntx = bli_gks_query_ind_cntx( ind_mod, dt );

    // Set k to the kc blocksize for the current datatype.
    k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
    //k_input = 256;
#endif

    // Choose the char corresponding to the requested datatype.
    if      ( bli_is_float( dt ) )    dt_ch = 's';
    else if ( bli_is_double( dt ) )   dt_ch = 'd';
    else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
    else                              dt_ch = 'z';

#if 0
    side = BLIS_LEFT;
#else
    side = BLIS_RIGHT;
#endif
#if 0
    uploa = BLIS_LOWER;
#else
    uploa = BLIS_UPPER;
#endif
    transa = BLIS_NO_TRANSPOSE;
    diaga  = BLIS_NONUNIT_DIAG;

    // Map BLIS parameter enums to their Fortran-77 character equivalents
    // for the BLAS code path.
    bli_param_map_blis_to_netlib_side( side, &f77_side );
    bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
    bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
    bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

    // Begin with initializing the last entry to zero so that
    // matlab allocates space for the entire array once up-front.
    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;

#ifdef BLIS
    printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
    printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
    printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
            ( unsigned long )(p - p_begin + 1)/p_inc + 1,
            ( unsigned long )0,
            ( unsigned long )0, 0.0 );

    for ( p = p_begin; p <= p_end; p += p_inc )
    {
        if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
        else               m = ( dim_t )    m_input;
        if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
        else               n = ( dim_t )    n_input;

        bli_obj_create( dt, 1, 1, 0, 0, &alpha );

        // BUGFIX: the dimensions of the triangular matrix A depend on the
        // side parameter (left: m x m; right: n x n), not on transposition.
        // The original code queried bli_does_trans( side ), which applies a
        // trans_t predicate to a side_t value and yields an n x n A even
        // when side is BLIS_LEFT. The gflops computation below already uses
        // bli_is_left( side ), confirming the intended predicate.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &a );
        else
            bli_obj_create( dt, n, n, 0, 0, &a );
        bli_obj_create( dt, m, n, 0, 0, &c );
        bli_obj_create( dt, m, n, 0, 0, &c_save );

        bli_randm( &a );
        bli_randm( &c );

        // Mark A as triangular and record its uplo/trans/diag properties.
        bli_obj_set_struc( BLIS_TRIANGULAR, &a );
        bli_obj_set_uplo( uploa, &a );
        bli_obj_set_conjtrans( transa, &a );
        bli_obj_set_diag( diaga, &a );

        // Re-randomize A (now flagged triangular) and zero the unstored
        // region.
        bli_randm( &a );
        bli_mktrim( &a );

        bli_setsc( (2.0/1.0), 0.0, &alpha );

        // Keep a pristine copy of C so each repeat starts from the same
        // operands.
        bli_copym( &c, &c_save );

#ifdef BLIS
        bli_ind_disable_all_dt( dt );
        bli_ind_enable_dt( ind, dt );
#endif

        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            bli_copym( &c_save, &c );

            dtime = bli_clock();

#ifdef PRINT
            bli_printm( "a", &a, "%4.1f", "" );
            bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_trmm( side,
                      &alpha,
                      &a,
                      &c );
#else
            if ( bli_is_float( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                float*   alphap = bli_obj_buffer( &alpha );
                float*   ap     = bli_obj_buffer( &a );
                float*   cp     = bli_obj_buffer( &c );

                strmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_double( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                double*  alphap = bli_obj_buffer( &alpha );
                double*  ap     = bli_obj_buffer( &a );
                double*  cp     = bli_obj_buffer( &c );

                dtrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_scomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                scomplex* alphap = bli_obj_buffer( &alpha );
                scomplex* ap     = bli_obj_buffer( &a );
                scomplex* cp     = bli_obj_buffer( &c );

                ctrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_dcomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                dcomplex* alphap = bli_obj_buffer( &alpha );
                dcomplex* ap     = bli_obj_buffer( &a );
                dcomplex* cp     = bli_obj_buffer( &c );

                ztrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
#endif

#ifdef PRINT
            bli_printm( "c after", &c, "%4.1f", "" );
            exit(1);
#endif

            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // trmm performs roughly m*m*n (left) or m*n*n (right) flops;
        // complex arithmetic costs ~4x the real flop count.
        if ( bli_is_left( side ) )
            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
        else
            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

        if ( bli_is_complex( dt ) ) gflops *= 4.0;

#ifdef BLIS
        printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
        printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
        printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin + 1)/p_inc + 1,
                ( unsigned long )m,
                ( unsigned long )n, gflops );

        bli_obj_free( &alpha );
        bli_obj_free( &a );
        bli_obj_free( &c );
        bli_obj_free( &c_save );
    }

    //bli_finalize();

    return 0;
}

338
test/3m4m/test_trsm.c Normal file
View File

@@ -0,0 +1,338 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for trsm (triangular solve with multiple
// right-hand sides). Sweeps problem sizes p from P_BEGIN to P_END in steps
// of P_INC, timing either the native BLIS implementation (when BLIS is
// #defined) or the Fortran-77 BLAS ?trsm_ routines, and prints
// matlab-readable gflops data. Returns 0 on completion; command-line
// arguments are ignored.
int main( int argc, char** argv )
{
    obj_t    a, c, d;
    obj_t    c_save;
    obj_t    alpha;
    dim_t    m, n;
    dim_t    p;
    dim_t    p_begin, p_end, p_inc;
    int      m_input, n_input;
    ind_t    ind;
    num_t    dt;
    char     dt_ch;
    int      r, n_repeats;
    side_t   side;
    uplo_t   uploa;
    trans_t  transa;
    diag_t   diaga;
    f77_char f77_side;
    f77_char f77_uploa;
    f77_char f77_transa;
    f77_char f77_diaga;
    double   dtime;
    double   dtime_save;
    double   gflops;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

    // Take the best of three timed repeats per problem size.
    n_repeats = 3;

    // DT, IND, P_BEGIN, P_END, P_INC, THR_STR, STR are compile-time
    // macros supplied by the Makefile.
    dt      = DT;
    ind     = IND;
    p_begin = P_BEGIN;
    p_end   = P_END;
    p_inc   = P_INC;

    // Negative values mean "set this dimension to p / |value|".
    m_input = -1;
    n_input = -1;

    // Suppress compiler warnings about unused variable 'ind'.
    ( void )ind;

#if 0
    cntx_t* cntx;
    ind_t ind_mod = ind;

    // A hack to use 3m1 as 1mpb (with 1m as 1mbp).
    if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;

    // Initialize a context for the current induced method and datatype.
    cntx = bli_gks_query_ind_cntx( ind_mod, dt );

    // Set k to the kc blocksize for the current datatype.
    k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
    //k_input = 256;
#endif

    // Choose the char corresponding to the requested datatype.
    if      ( bli_is_float( dt ) )    dt_ch = 's';
    else if ( bli_is_double( dt ) )   dt_ch = 'd';
    else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
    else                              dt_ch = 'z';

#if 0
    side = BLIS_LEFT;
#else
    side = BLIS_RIGHT;
#endif
#if 0
    uploa = BLIS_LOWER;
#else
    uploa = BLIS_UPPER;
#endif
    transa = BLIS_NO_TRANSPOSE;
    diaga  = BLIS_NONUNIT_DIAG;

    // Map BLIS parameter enums to their Fortran-77 character equivalents
    // for the BLAS code path.
    bli_param_map_blis_to_netlib_side( side, &f77_side );
    bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
    bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
    bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

    // Begin with initializing the last entry to zero so that
    // matlab allocates space for the entire array once up-front.
    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;

#ifdef BLIS
    printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
    printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
    printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
            ( unsigned long )(p - p_begin + 1)/p_inc + 1,
            ( unsigned long )0,
            ( unsigned long )0, 0.0 );

    for ( p = p_begin; p <= p_end; p += p_inc )
    {
        if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
        else               m = ( dim_t )    m_input;
        if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
        else               n = ( dim_t )    n_input;

        bli_obj_create( dt, 1, 1, 0, 0, &alpha );

        // BUGFIX: the dimensions of the triangular matrix A (and of the
        // diagonal-shift matrix d below) depend on the side parameter
        // (left: m x m; right: n x n), not on transposition. The original
        // code queried bli_does_trans( side ), which applies a trans_t
        // predicate to a side_t value and yields n x n objects even when
        // side is BLIS_LEFT. The gflops computation below already uses
        // bli_is_left( side ), confirming the intended predicate.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &a );
        else
            bli_obj_create( dt, n, n, 0, 0, &a );
        bli_obj_create( dt, m, n, 0, 0, &c );
        //bli_obj_create( dt, m, n, n, 1, &c );
        bli_obj_create( dt, m, n, 0, 0, &c_save );
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &d );
        else
            bli_obj_create( dt, n, n, 0, 0, &d );

        bli_randm( &a );
        bli_randm( &c );

        // Mark A as triangular and record its uplo/trans/diag properties.
        bli_obj_set_struc( BLIS_TRIANGULAR, &a );
        bli_obj_set_uplo( uploa, &a );
        bli_obj_set_conjtrans( transa, &a );
        bli_obj_set_diag( diaga, &a );

        // Re-randomize A (now flagged triangular) and zero the unstored
        // region.
        bli_randm( &a );
        bli_mktrim( &a );

        // Shift A's diagonal by 2.0 so the triangular system is
        // well-conditioned for the solve.
        bli_setd( &BLIS_TWO, &d );
        bli_addd( &d, &a );

        bli_setsc( (2.0/1.0), 0.0, &alpha );

        // Keep a pristine copy of C so each repeat starts from the same
        // operands.
        bli_copym( &c, &c_save );

#ifdef BLIS
        bli_ind_disable_all_dt( dt );
        bli_ind_enable_dt( ind, dt );
#endif

        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            bli_copym( &c_save, &c );

            dtime = bli_clock();

#ifdef PRINT
            bli_printm( "a", &a, "%4.1f", "" );
            bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_trsm( side,
                      &alpha,
                      &a,
                      &c );
#else
            if ( bli_is_float( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                float*   alphap = bli_obj_buffer( &alpha );
                float*   ap     = bli_obj_buffer( &a );
                float*   cp     = bli_obj_buffer( &c );

                strsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_double( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                double*  alphap = bli_obj_buffer( &alpha );
                double*  ap     = bli_obj_buffer( &a );
                double*  cp     = bli_obj_buffer( &c );

                dtrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_scomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                scomplex* alphap = bli_obj_buffer( &alpha );
                scomplex* ap     = bli_obj_buffer( &a );
                scomplex* cp     = bli_obj_buffer( &c );

                ctrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_dcomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                dcomplex* alphap = bli_obj_buffer( &alpha );
                dcomplex* ap     = bli_obj_buffer( &a );
                dcomplex* cp     = bli_obj_buffer( &c );

                ztrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
#endif

#ifdef PRINT
            bli_printm( "c after", &c, "%4.1f", "" );
            exit(1);
#endif

            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // trsm performs roughly m*m*n (left) or m*n*n (right) flops;
        // complex arithmetic costs ~4x the real flop count.
        if ( bli_is_left( side ) )
            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
        else
            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

        if ( bli_is_complex( dt ) ) gflops *= 4.0;

#ifdef BLIS
        printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
        printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
        printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin + 1)/p_inc + 1,
                ( unsigned long )m,
                ( unsigned long )n, gflops );

        bli_obj_free( &alpha );
        bli_obj_free( &a );
        bli_obj_free( &c );
        bli_obj_free( &c_save );
        bli_obj_free( &d );
    }

    //bli_finalize();

    return 0;
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -290,13 +291,13 @@ int main( int argc, char** argv )
thrinfo.work_id = t;
if ( part_n_dim && go_fwd )
area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
else if ( part_n_dim && go_bwd )
area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
else if ( part_m_dim && go_fwd )
area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
else // ( part_m_dim && go_bwd )
area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
width = end - start;

View File

@@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt
bli_thread_get_jr_nt
bli_thread_get_num_threads
bli_thread_get_pc_nt
bli_thread_get_range_b2t
bli_thread_get_range_l2r
bli_thread_get_range_mdim
bli_thread_get_range_ndim
bli_thread_get_range_r2l
bli_thread_get_range_sub
bli_thread_get_range_t2b
bli_thread_get_range_weighted_b2t
bli_thread_get_range_weighted_l2r
bli_thread_get_range_weighted_r2l
bli_thread_get_range_weighted_sub
bli_thread_get_range_weighted_t2b
bli_thread_get_range_width_l
bli_thread_range_b2t
bli_thread_range_l2r
bli_thread_range_mdim
bli_thread_range_ndim
bli_thread_range_r2l
bli_thread_range_sub
bli_thread_range_t2b
bli_thread_range_weighted_b2t
bli_thread_range_weighted_l2r
bli_thread_range_weighted_r2l
bli_thread_range_weighted_sub
bli_thread_range_weighted_t2b
bli_thread_range_width_l
bli_thread_init
bli_thread_init_rntm
bli_thread_init_rntm_from_env