Added option of slab or rr partitioning in jr/ir.

Details: - Updated existing macrokernel function names and definitions to explicitly use slab assignment of micropanels to threads, then created duplicate versions of macrokernels that explicitly use round-robin assignment instead of slab. NOTE: As in ac18949, trsm_r macrokernels were not substantially updated in this commit because they are currently disabled in bli_trsm_front.c. - Updated existing packing function (in bli_packm_blk_var1.c) to explicitly use slab partitioning, and then duplicated for round-robin. - Updated control tree initialization to use the appropriate macrokernel and packm function pointers depending on which method (slab or rr) was enabled at configure-time. - Updated configure script to accept new --thread-part-jrir=[slab|rr] option (-r [slab|rr] for short), which allows the user to explicitly request either slab or round-robin assignment (partitioning) of micropanels to threads. - Updated sandbox/ref99 according to above changes. - Minor updates to build/add-copyright.py.
2026-06-29 02:37:05 +00:00 · 2018-10-07 20:30:32 -05:00
parent 98e01ea04b
commit c92762ecdc
48 changed files with 7477 additions and 1394 deletions
--- a/build/add-copyright.py
+++ b/build/add-copyright.py
@@ -187,6 +187,8 @@ def main():
 		else:
 			filename = git_words[1]

+		#my_echo( "-debug---- %s" % filename )
+
 		# Start by opening the file. (We can assume it exists since it
 		# was found by 'git status', so no need to check for existence.)
 		# Read all lines in the file and then close it.
@@ -203,7 +205,7 @@ def main():
 		# If the file does not have any copyright notice in it already, we
 		# assume we don't need to update it.
 		if not has_cr:
-			my_echo( "[skipped] %s" % filename )
+			my_echo( "[nocrline] %s" % filename )
 			continue

 		# Check whether the file already has a copyright for the_org. We may
@@ -214,7 +216,7 @@ def main():
 		mod_file_lines = []

 		# At this point we know that the file has at least one copyright, and
-		# has_org_cr encodes whether already has a copyright for the_org.
+		# has_org_cr encodes whether it already has a copyright for the_org.

 		# We process the files that we know already have copyrights for the_org
 		# differently from the files that do not yet have them.
@@ -240,12 +242,15 @@ def main():
 						repl_line = ' %s, ' % cur_year
 						line_ny = re.sub( find_line, repl_line, line )

-						my_echo( "[updated] %s" % filename )
+						my_echo( "[updated ] %s" % filename )

 						# Add the updated line to the running list.
 						mod_file_lines += line_ny

 					else:
+
+						my_echo( "[up2date ] %s" % filename )
+
 						# Add the unchanged line to the running list.
 						mod_file_lines += line
 							
@@ -262,7 +267,7 @@ def main():
 			# Don't go any further if we're only updating existing copyright
 			# lines.
 			if update_only:
-				my_echo( "[skipped] %s" % filename )
+				my_echo( "[nocrline] %s" % filename )
 				continue

 			num_file_lines = len( file_lines )
@@ -313,7 +318,7 @@ def main():
 						mod_file_lines += line
 						mod_file_lines += line_nyno

-						my_echo( "[added  ] %s" % filename )
+						my_echo( "[added   ] %s" % filename )

 					# endif resnext

--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -52,6 +53,14 @@
 #define BLIS_ENABLE_PTHREADS
 #endif

+#if @enable_jrir_slab@
+#define BLIS_ENABLE_JRIR_SLAB
+#endif
+
+#if @enable_jrir_rr@
+#define BLIS_ENABLE_JRIR_RR
+#endif
+
 #if @enable_packbuf_pools@
 #define BLIS_ENABLE_PACKBUF_POOLS
 #endif
--- a/configure
+++ b/configure
@@ -163,9 +163,6 @@ print_usage()
 	echo "                 incur additional overhead in some (but not all)"
 	echo "                 situations."
 	echo " "
-	echo "   -q, --quiet   Suppress informational output. By default, configure"
-	echo "                 is verbose. (NOTE: -q is not yet implemented)"
-	echo " "
 	echo "   -i SIZE, --int-size=SIZE"
 	echo " "
 	echo "                 Set the size (in bits) of internal BLIS integers and"
@@ -212,6 +209,19 @@ print_usage()
 	echo "                 detects the presence of libmemkind, libmemkind is used"
 	echo "                 by default, and otherwise it is not used by default."
 	echo " "
+	echo "   --thread-part-jrir=METHOD"
+	echo " "
+	echo "                 Request a method of assigning micropanels to threads in"
+	echo "                 the JR and IR loops. Valid options are 'slab' and 'rr'."
+	echo "                 Using 'slab' assigns (as much as possible) contiguous"
+	echo "                 regions of micropanels to each thread while the latter"
+	echo "                 assigns micropanels to threads in a round-robin fashion."
+	echo "                 (NOTE: Specifying this option constitutes a *request*,"
+	echo "                 which may be ignored in certain situations if the"
+	echo "                 implementation has a good reason to do so.) The chosen"
+	echo "                 method also applies during the packing of A and B. The"
+	echo "                 default method is 'slab'."
+	echo " "
 	echo "   --force-version=STRING"
 	echo " "
 	echo "                 Force configure to use an arbitrary version string"
@@ -226,6 +236,9 @@ print_usage()
 	echo "                 a sanity check to make sure these lists are constituted"
 	echo "                 as expected."
 	echo " "
+	echo "   -q, --quiet   Suppress informational output. By default, configure"
+	echo "                 is verbose. (NOTE: -q is not yet implemented)"
+	echo " "
 	echo "   -h, --help    Output this information and quit."
 	echo " "
 	echo " Environment Variables:"
@@ -1591,6 +1604,9 @@ main()
 	# The threading flag.
 	threading_model='no'

+	# The method of assigning micropanels to threads in the JR and IR loops.
+	thread_part_jrir='slab'
+
 	# Option variables.
 	quiet_flag=''
 	show_config_list=''
@@ -1641,7 +1657,7 @@ main()
 	# -- Command line option/argument parsing ----------------------------------

 	# Process our command line options.
-	while getopts ":hp:d:s:t:qci:b:-:" opt; do
+	while getopts ":hp:d:s:t:r:qci:b:-:" opt; do
 		case $opt in
 			-)
 				case "$OPTARG" in
@@ -1705,6 +1721,9 @@ main()
 					enable-threading=*)
 						threading_model=${OPTARG#*=}
 						;;
+					thread-part-jrir=*)
+						thread_part_jrir=${OPTARG#*=}
+						;;
 					disable-threading)
 						threading_model='no'
 						;;
@@ -1776,6 +1795,9 @@ main()
 			t)
 				threading_model=$OPTARG
 				;;
+			r)
+				thread_part_jrir=$OPTARG
+				;;
 			i)
 				int_type_size=$OPTARG
 				;;
@@ -1791,7 +1813,7 @@ main()
 		esac
 	done
 	shift $(($OPTIND - 1))
-	
+
 	# Parse environment variables
 	while [ $# -gt 0 ]; do
 		case $1 in
@@ -2351,7 +2373,7 @@ main()
 	elif [ "x${threading_model}" = "xpthreads" ] ||
 	     [ "x${threading_model}" = "xpthread" ] ||
 	     [ "x${threading_model}" = "xposix" ]; then
-		echo "${script_name}: using Pthreads for threading."
+		echo "${script_name}: using POSIX threads for threading."
 		enable_pthreads='yes'
 		enable_pthreads_01=1
 		threading_model="pthreads" # Standardize the value.
@@ -2362,7 +2384,22 @@ main()
 		echo "${script_name}: *** Unsupported threading model: ${threading_model}."
 		exit 1
 	fi
-	
+
+	# Validate the requested method of assigning micropanels to threads in the
+	# JR and IR loops; exactly one 0/1 flag is set, or we abort on bad input.
+	enable_jrir_slab_01=0
+	enable_jrir_rr_01=0
+	if [ "x${thread_part_jrir}" = "xslab" ]; then
+		echo "${script_name}: requesting slab threading in jr and ir loops."
+		enable_jrir_slab_01=1
+	elif [ "x${thread_part_jrir}" = "xrr" ]; then
+		echo "${script_name}: requesting round-robin threading in jr and ir loops."
+		enable_jrir_rr_01=1
+	else
+		echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}."
+		exit 1
+	fi
+
 	# Convert 'yes' and 'no' flags to booleans.
 	if [ "x${enable_packbuf_pools}" = "xyes" ]; then
 		echo "${script_name}: internal memory pools for packing buffers are enabled."
@@ -2414,7 +2451,7 @@ main()
 		echo "${script_name}: the CBLAS compatibility layer is disabled."
 		enable_cblas_01=0
 	fi
-	
+
 	# Report integer sizes
 	if [ "x${int_type_size}" = "x32" ]; then
 		echo "${script_name}: the internal integer size is 32-bit."
@@ -2590,6 +2627,8 @@ main()
 		| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
 		| sed   -e "s/@enable_openmp@/${enable_openmp_01}/g" \
 		| sed   -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
+		| sed   -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
+		| sed   -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
 		| sed   -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \
 		| sed   -e "s/@int_type_size@/${int_type_size}/g" \
 		| sed   -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
@@ -2681,7 +2720,7 @@ main()


 	# -- Mirror source directory hierarchies to object directories -------------
-	
+
 	# Combine the config_list with the config_name and then remove duplicates.
 	config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}")

--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -39,9 +40,7 @@

 #include "bli_packm_part.h"

-#include "bli_packm_unb_var1.h"
-
-#include "bli_packm_blk_var1.h"
+#include "bli_packm_var.h"

 #include "bli_packm_struc_cxk.h"
 #include "bli_packm_struc_cxk_4mi.h"
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -34,71 +35,6 @@

 #include "blis.h"

-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)(
-                           struc_t strucc,
-                           doff_t  diagoffc,
-                           diag_t  diagc,
-                           uplo_t  uploc,
-                           trans_t transc,
-                           pack_t  schema,
-                           bool_t  invdiag,
-                           bool_t  revifup,
-                           bool_t  reviflo,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_max,
-                           dim_t   n_max,
-                           void*   kappa,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                                      inc_t is_p,
-                                      dim_t pd_p, inc_t ps_p,
-                           void*   packm_ker,
-                           cntx_t* cntx,
-                           thrinfo_t* thread
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
-
-
-static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
-{
-                /* float (0)  scomplex (1)  double (2)  dcomplex (3) */
-// 0000 row/col panels
-               { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
-                   bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
-// 0001 row/col panels: 4m interleaved
-               { { NULL,                      bli_cpackm_struc_cxk_4mi,
-                   NULL,                      bli_zpackm_struc_cxk_4mi,  } },
-// 0010 row/col panels: 3m interleaved
-               { { NULL,                      bli_cpackm_struc_cxk_3mis,
-                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
-// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
-               { { NULL,                      NULL,
-                   NULL,                      NULL,                      } },
-// 0100 row/col panels: 3m separated
-               { { NULL,                      bli_cpackm_struc_cxk_3mis,
-                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
-// 0101 row/col panels: real only
-               { { NULL,                      bli_cpackm_struc_cxk_rih,
-                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
-// 0110 row/col panels: imaginary only
-               { { NULL,                      bli_cpackm_struc_cxk_rih,
-                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
-// 0111 row/col panels: real+imaginary only
-               { { NULL,                      bli_cpackm_struc_cxk_rih,
-                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
-// 1000 row/col panels: 1m-expanded (1e)
-               { { NULL,                      bli_cpackm_struc_cxk_1er,
-                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
-// 1001 row/col panels: 1m-reordered (1r)
-               { { NULL,                      bli_cpackm_struc_cxk_1er,
-                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
-};
-
-
 void bli_packm_blk_var1
     (
       obj_t*   c,
@@ -108,619 +44,14 @@ void bli_packm_blk_var1
       thrinfo_t* t
     )
 {
-	num_t     dt_cp      = bli_obj_dt( c );
+#ifdef BLIS_ENABLE_JRIR_SLAB

-	struc_t   strucc     = bli_obj_struc( c );
-	doff_t    diagoffc   = bli_obj_diag_offset( c );
-	diag_t    diagc      = bli_obj_diag( c );
-	uplo_t    uploc      = bli_obj_uplo( c );
-	trans_t   transc     = bli_obj_conjtrans_status( c );
-	pack_t    schema     = bli_obj_pack_schema( p );
-	bool_t    invdiag    = bli_obj_has_inverted_diag( p );
-	bool_t    revifup    = bli_obj_is_pack_rev_if_upper( p );
-	bool_t    reviflo    = bli_obj_is_pack_rev_if_lower( p );
+	bli_packm_blk_var1sl( c, p, cntx, cntl, t );

-	dim_t     m_p        = bli_obj_length( p );
-	dim_t     n_p        = bli_obj_width( p );
-	dim_t     m_max_p    = bli_obj_padded_length( p );
-	dim_t     n_max_p    = bli_obj_padded_width( p );
+#else // BLIS_ENABLE_JRIR_RR

-	void*     buf_c      = bli_obj_buffer_at_off( c );
-	inc_t     rs_c       = bli_obj_row_stride( c );
-	inc_t     cs_c       = bli_obj_col_stride( c );
+	bli_packm_blk_var1rr( c, p, cntx, cntl, t );

-	void*     buf_p      = bli_obj_buffer_at_off( p );
-	inc_t     rs_p       = bli_obj_row_stride( p );
-	inc_t     cs_p       = bli_obj_col_stride( p );
-	inc_t     is_p       = bli_obj_imag_stride( p );
-	dim_t     pd_p       = bli_obj_panel_dim( p );
-	inc_t     ps_p       = bli_obj_panel_stride( p );
-
-	obj_t     kappa;
-	obj_t*    kappa_p;
-	void*     buf_kappa;
-
-	func_t*   packm_kers;
-	void*     packm_ker;
-
-	FUNCPTR_T f;
-
-
-	// Treatment of kappa (ie: packing during scaling) depends on
-	// whether we are executing an induced method.
-	if ( bli_is_nat_packed( schema ) )
-	{
-		// This branch is for native execution, where we assume that
-		// the micro-kernel will always apply the alpha scalar of the
-		// higher-level operation. Thus, we use BLIS_ONE for kappa so
-		// that the underlying packm implementation does not perform
-		// any scaling during packing.
-		buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
-	}
-	else // if ( bli_is_ind_packed( schema ) )
-	{
-		// The value for kappa we use will depend on whether the scalar
-		// attached to A has a nonzero imaginary component. If it does,
-		// then we will apply the scalar during packing to facilitate
-		// implementing induced complex domain algorithms in terms of
-		// real domain micro-kernels. (In the aforementioned situation,
-		// applying a real scalar is easy, but applying a complex one is
-		// harder, so we avoid the need altogether with the code below.)
-		if ( bli_obj_scalar_has_nonzero_imag( p ) )
-		{
-			//printf( "applying non-zero imag kappa\n" );
-
-			// Detach the scalar.
-			bli_obj_scalar_detach( p, &kappa );
-
-			// Reset the attached scalar (to 1.0).
-			bli_obj_scalar_reset( p );
-
-			kappa_p = &kappa;
-		}
-		else
-		{
-			// If the internal scalar of A has only a real component, then
-			// we will apply it later (in the micro-kernel), and so we will
-			// use BLIS_ONE to indicate no scaling during packing.
-			kappa_p = &BLIS_ONE;
-		}
-	
-		// Acquire the buffer to the kappa chosen above.
-		buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
-	}
-
-
-	// Choose the correct func_t object based on the pack_t schema.
-#if 0
-	if      ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
-	else if ( bli_is_3mi_packed( schema ) ||
-	          bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
-	else if ( bli_is_ro_packed( schema ) ||
-	          bli_is_io_packed( schema ) ||
-	         bli_is_rpi_packed( schema ) )  packm_kers = packm_struc_cxk_rih_kers;
-	else                                    packm_kers = packm_struc_cxk_kers;
-#else
-	// The original idea here was to read the packm_ukr from the context
-	// if it is non-NULL. The problem is, it requires that we be able to
-	// assume that the packm_ukr field is initialized to NULL, which it
-	// currently is not.
-
-	//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
-
-	//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
-	{
-		// If the packm structure-aware kernel func_t in the context is
-		// NULL (which is the default value after the context is created),
-		// we use the default lookup table to determine the right func_t
-		// for the current schema.
-		const dim_t i = bli_pack_schema_index( schema );
-
-		packm_kers = &packm_struc_cxk_kers[ i ];
-	}
-#if 0
-	else // cntx's packm func_t overrides
-	{
-		// If the packm structure-aware kernel func_t in the context is
-		// non-NULL (ie: assumed to be valid), we use that instead.
-		//packm_kers = bli_cntx_packm_ukrs( cntx );
-		packm_kers = cntx_packm_kers;
-	}
 #endif
-#endif
-
-	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker = bli_func_get_dt( dt_cp, packm_kers );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_cp];
-
-	// Invoke the function.
-	f( strucc,
-	   diagoffc,
-	   diagc,
-	   uploc,
-	   transc,
-	   schema,
-	   invdiag,
-	   revifup,
-	   reviflo,
-	   m_p,
-	   n_p,
-	   m_max_p,
-	   n_max_p,
-	   buf_kappa,
-	   buf_c, rs_c, cs_c,
-	   buf_p, rs_p, cs_p,
-	          is_p,
-	          pd_p, ps_p,
-	   packm_ker,
-	   cntx,
-	   t );
 }

-
-#undef  GENTFUNCR
-#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       struc_t strucc, \
-       doff_t  diagoffc, \
-       diag_t  diagc, \
-       uplo_t  uploc, \
-       trans_t transc, \
-       pack_t  schema, \
-       bool_t  invdiag, \
-       bool_t  revifup, \
-       bool_t  reviflo, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   m_max, \
-       dim_t   n_max, \
-       void*   kappa, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       void*   p, inc_t rs_p, inc_t cs_p, \
-                  inc_t is_p, \
-                  dim_t pd_p, inc_t ps_p, \
-       void*   packm_ker, \
-       cntx_t* cntx, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
-\
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-	ctype* restrict c_begin; \
-	ctype* restrict p_begin; \
-\
-	dim_t           iter_dim; \
-	dim_t           num_iter; \
-	dim_t           it, ic, ip; \
-	dim_t           ic0, ip0; \
-	doff_t          ic_inc, ip_inc; \
-	doff_t          diagoffc_i; \
-	doff_t          diagoffc_inc; \
-	dim_t           panel_len_full; \
-	dim_t           panel_len_i; \
-	dim_t           panel_len_max; \
-	dim_t           panel_len_max_i; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	dim_t           panel_off_i; \
-	inc_t           vs_c; \
-	inc_t           ldc; \
-	inc_t           ldp, p_inc; \
-	dim_t*          m_panel_full; \
-	dim_t*          n_panel_full; \
-	dim_t*          m_panel_use; \
-	dim_t*          n_panel_use; \
-	dim_t*          m_panel_max; \
-	dim_t*          n_panel_max; \
-	conj_t          conjc; \
-	bool_t          row_stored; \
-	bool_t          col_stored; \
-	inc_t           is_p_use; \
-	dim_t           ss_num; \
-	dim_t           ss_den; \
-\
-	ctype* restrict c_use; \
-	ctype* restrict p_use; \
-	doff_t          diagoffp_i; \
-\
-\
-	/* If C is zeros and part of a triangular matrix, then we don't need
-	   to pack it. */ \
-	if ( bli_is_zeros( uploc ) && \
-	     bli_is_triangular( strucc ) ) return; \
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
-\
-	/* If c needs a transposition, induce it so that we can more simply
-	   express the remaining parameters and code. */ \
-	if ( bli_does_trans( transc ) ) \
-	{ \
-		bli_swap_incs( &rs_c, &cs_c ); \
-		bli_negate_diag_offset( &diagoffc ); \
-		bli_toggle_uplo( &uploc ); \
-		bli_toggle_trans( &transc ); \
-	} \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = rs_c; \
-		vs_c           = cs_c; \
-		diagoffc_inc   = -( doff_t )panel_dim_max; \
-		ldp            = rs_p; \
-		m_panel_full   = &m; \
-		n_panel_full   = &panel_dim_i; \
-		m_panel_use    = &panel_len_i; \
-		n_panel_use    = &panel_dim_i; \
-		m_panel_max    = &panel_len_max_i; \
-		n_panel_max    = &panel_dim_max; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = cs_c; \
-		vs_c           = rs_c; \
-		diagoffc_inc   = ( doff_t )panel_dim_max; \
-		ldp            = cs_p; \
-		m_panel_full   = &panel_dim_i; \
-		n_panel_full   = &n; \
-		m_panel_use    = &panel_dim_i; \
-		n_panel_use    = &panel_len_i; \
-		m_panel_max    = &panel_dim_max; \
-		n_panel_max    = &panel_len_max_i; \
-	} \
-\
-	/* Compute the storage stride scaling. Usually this is just 1. However,
-	   in the case of interleaved 3m, we need to scale by 3/2, and in the
-	   cases of real-only, imag-only, or summed-only, we need to scale by
-	   1/2. In both cases, we are compensating for the fact that pointer
-	   arithmetic occurs in terms of complex elements rather than real
-	   elements. */ \
-	if      ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
-	else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
-	else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
-	else                                    { ss_num = 1; ss_den = 1; } \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
-	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
-	{ \
-		ic0    = (num_iter - 1) * panel_dim_max; \
-		ic_inc = -panel_dim_max; \
-		ip0    = num_iter - 1; \
-		ip_inc = -1; \
-	} \
-	else \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-		ip0    = 0; \
-		ip_inc = 1; \
-	} \
-\
-	p_begin = p_cast; \
-\
-/*
-if ( row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
-                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
-if ( col_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
-                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
-*/ \
-\
-	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < num_iter; \
-	      ic += ic_inc, ip += ip_inc, it += 1 ) \
-	{ \
-		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
-		c_begin     = c_cast   + (ic  )*vs_c; \
-\
-		if ( bli_is_triangular( strucc ) &&  \
-		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is completely unstored (ie: zero). If the panel
-			   is unstored, we do nothing. (Notice that we don't even
-			   increment p_begin.) */ \
-\
-			continue; \
-		} \
-		else if ( bli_is_triangular( strucc ) &&  \
-		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is diagonal-intersecting. Notice that we
-			   cannot bury the following conditional logic into
-			   packm_struc_cxk() because we need to know the value of
-			   panel_len_max_i so we can properly increment p_inc. */ \
-\
-			/* Sanity check. Diagonals should not intersect the short end of
-			   a micro-panel. If they do, then somehow the constraints on
-			   cache blocksizes being a whole multiple of the register
-			   blocksizes was somehow violated. */ \
-			if ( ( col_stored && diagoffc_i < 0 ) || \
-			     ( row_stored && diagoffc_i > 0 ) ) \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-			          ( col_stored && bli_is_lower( uploc ) ) )  \
-			{ \
-				panel_off_i     = 0; \
-				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
-				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
-				                           panel_len_max ); \
-				diagoffp_i      = diagoffc_i; \
-			} \
-			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
-			{ \
-				panel_off_i     = bli_abs( diagoffc_i ); \
-				panel_len_i     = panel_len_full - panel_off_i; \
-				panel_len_max_i = panel_len_max  - panel_off_i; \
-				diagoffp_i      = 0; \
-			} \
-\
-			c_use = c_begin + (panel_off_i  )*ldc; \
-			p_use = p_begin; \
-\
-			/* We need to re-compute the imaginary stride as a function of
-			   panel_len_max_i since triangular packed matrices have panels
-			   of varying lengths. NOTE: This imaginary stride value is
-			   only referenced by the packm kernels for induced methods. */ \
-			is_p_use  = ldp * panel_len_max_i; \
-\
-			/* We nudge the imaginary stride up by one if it is odd. */ \
-			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffp_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			/* NOTE: This value is usually LESS than ps_p because triangular
-			   matrices usually have several micro-panels that are shorter
-			   than a "full" micro-panel. */ \
-			p_inc = ( is_p_use * ss_num ) / ss_den; \
-		} \
-		else if ( bli_is_herm_or_symm( strucc ) ) \
-		{ \
-			/* This case executes if the panel belongs to a Hermitian or
-			   symmetric matrix, which includes stored, unstored, and
-			   diagonal-intersecting panels. */ \
-\
-			c_use = c_begin; \
-			p_use = p_begin; \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			is_p_use = is_p; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffc_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			p_inc = ps_p; \
-		} \
-		else \
-		{ \
-			/* This case executes if the panel is general, or, if the
-			   panel is part of a triangular matrix and is neither unstored
-			   (ie: zero) nor diagonal-intersecting. */ \
-\
-			c_use = c_begin; \
-			p_use = p_begin; \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			is_p_use = is_p; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( BLIS_GENERAL, \
-				                0, \
-				                diagc, \
-				                BLIS_DENSE, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-			                           is_p_use, \
-				                cntx ); \
-			} \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			p_inc = ps_p; \
-		} \
-\
-/*
-if ( col_stored ) { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_obarrier( thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_obarrier( thread ); \
-} \
-else { \
-	if ( bli_thread_work_id( thread ) == 0 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_obarrier( thread ); \
-	if ( bli_thread_work_id( thread ) == 1 ) \
-	{ \
-	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
-	fflush( stdout ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
-	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
-	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
-	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-	fflush( stdout ); \
-	} \
-bli_thread_obarrier( thread ); \
-} \
-*/ \
-\
-/*
-		if ( bli_is_4mi_packed( schema ) ) { \
-		printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
-		if ( col_stored ) { \
-		if ( 0 ) \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
-		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-		if ( row_stored ) { \
-		if ( 0 ) \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
-		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-		} \
-*/ \
-/*
-*/ \
-\
-/*
-*/ \
-/*
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-*/ \
-\
-\
-/*
-		if ( row_stored ) { \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
-		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-		inc_t is_b = rs_p * *m_panel_max; \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-*/ \
-\
-\
-/*
-		if ( col_stored ) { \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
-		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
-		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
-		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
-		} \
-*/ \
-\
-		p_begin += p_inc; \
-\
-	} \
-}
-
-INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
-
--- a/frame/1m/packm/bli_packm_blk_var1.c.old
+++ b/frame/1m/packm/bli_packm_blk_var1.c.old
@@ -1,463 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
-      of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)(
-                           struc_t strucc,
-                           doff_t  diagoffc,
-                           diag_t  diagc,
-                           uplo_t  uploc,
-                           trans_t transc,
-                           pack_t  schema,
-                           bool_t  invdiag,
-                           bool_t  revifup,
-                           bool_t  reviflo,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_max,
-                           dim_t   n_max,
-                           void*   kappa,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                                      inc_t is_p,
-                                      dim_t pd_p, inc_t ps_p,
-                           void*   packm_ker,
-                           packm_thrinfo_t* thread
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
-
-extern func_t* packm_struc_cxk_kers;
-
-
-void bli_packm_blk_var1( obj_t*   c,
-                         obj_t*   p,
-                         packm_thrinfo_t* t )
-{
-	num_t     dt_cp      = bli_obj_dt( c );
-
-	struc_t   strucc     = bli_obj_struc( c );
-	doff_t    diagoffc   = bli_obj_diag_offset( c );
-	diag_t    diagc      = bli_obj_diag( c );
-	uplo_t    uploc      = bli_obj_uplo( c );
-	trans_t   transc     = bli_obj_conjtrans_status( c );
-	pack_t    schema     = bli_obj_pack_schema( p );
-	bool_t    invdiag    = bli_obj_has_inverted_diag( p );
-	bool_t    revifup    = bli_obj_is_pack_rev_if_upper( p );
-	bool_t    reviflo    = bli_obj_is_pack_rev_if_lower( p );
-
-	dim_t     m_p        = bli_obj_length( p );
-	dim_t     n_p        = bli_obj_width( p );
-	dim_t     m_max_p    = bli_obj_padded_length( p );
-	dim_t     n_max_p    = bli_obj_padded_width( p );
-
-	void*     buf_c      = bli_obj_buffer_at_off( c );
-	inc_t     rs_c       = bli_obj_row_stride( c );
-	inc_t     cs_c       = bli_obj_col_stride( c );
-
-	void*     buf_p      = bli_obj_buffer_at_off( p );
-	inc_t     rs_p       = bli_obj_row_stride( p );
-	inc_t     cs_p       = bli_obj_col_stride( p );
-	inc_t     is_p       = bli_obj_imag_stride( p );
-	dim_t     pd_p       = bli_obj_panel_dim( p );
-	inc_t     ps_p       = bli_obj_panel_stride( p );
-
-	void*     buf_kappa;
-
-	func_t*   packm_kers;
-	void*     packm_ker;
-
-	FUNCPTR_T f;
-
-	// This variant assumes that the micro-kernel will always apply the
-	// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
-	// for kappa so that the underlying packm implementation does not
-	// scale during packing.
-	buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
-
-	// Choose the correct func_t object.
-	packm_kers = packm_struc_cxk_kers;
-
-	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker = bli_func_obj_query( dt_cp, packm_kers );
-
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_cp];
-
-	// Invoke the function.
-	f( strucc,
-	   diagoffc,
-	   diagc,
-	   uploc,
-	   transc,
-	   schema,
-	   invdiag,
-	   revifup,
-	   reviflo,
-	   m_p,
-	   n_p,
-	   m_max_p,
-	   n_max_p,
-	   buf_kappa,
-	   buf_c, rs_c, cs_c,
-	   buf_p, rs_p, cs_p,
-	          is_p,
-	          pd_p, ps_p,
-	   packm_ker,
-	   t );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kertype ) \
-\
-void PASTEMAC(ch,varname) \
-                           struc_t strucc, \
-                           doff_t  diagoffc, \
-                           diag_t  diagc, \
-                           uplo_t  uploc, \
-                           trans_t transc, \
-                           pack_t  schema, \
-                           bool_t  invdiag, \
-                           bool_t  revifup, \
-                           bool_t  reviflo, \
-                           dim_t   m, \
-                           dim_t   n, \
-                           dim_t   m_max, \
-                           dim_t   n_max, \
-                           void*   kappa, \
-                           void*   c, inc_t rs_c, inc_t cs_c, \
-                           void*   p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t is_p, \
-                                      dim_t pd_p, inc_t ps_p, \
-                           void*   packm_ker, \
-                           packm_thrinfo_t* thread \
-                         ) \
-{ \
-	PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
-\
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-	ctype* restrict c_begin; \
-	ctype* restrict p_begin; \
-\
-	dim_t           iter_dim; \
-	dim_t           num_iter; \
-	dim_t           it, ic, ip; \
-	dim_t           ic0, ip0; \
-	doff_t          ic_inc, ip_inc; \
-	doff_t          diagoffc_i; \
-	doff_t          diagoffc_inc; \
-	dim_t           panel_len_full; \
-	dim_t           panel_len_i; \
-	dim_t           panel_len_max; \
-	dim_t           panel_len_max_i; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	dim_t           panel_off_i; \
-	inc_t           vs_c; \
-	inc_t           ldc; \
-	inc_t           ldp, p_inc; \
-	dim_t*          m_panel_full; \
-	dim_t*          n_panel_full; \
-	dim_t*          m_panel_use; \
-	dim_t*          n_panel_use; \
-	dim_t*          m_panel_max; \
-	dim_t*          n_panel_max; \
-	conj_t          conjc; \
-	bool_t          row_stored; \
-	bool_t          col_stored; \
-\
-	ctype* restrict c_use; \
-	ctype* restrict p_use; \
-	doff_t          diagoffp_i; \
-\
-\
-	/* If C is zeros and part of a triangular matrix, then we don't need
-	   to pack it. */ \
-	if ( bli_is_zeros( uploc ) && \
-	     bli_is_triangular( strucc ) ) return; \
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
-\
-	/* If c needs a transposition, induce it so that we can more simply
-	   express the remaining parameters and code. */ \
-	if ( bli_does_trans( transc ) ) \
-	{ \
-		bli_swap_incs( &rs_c, &cs_c ); \
-		bli_negate_diag_offset( &diagoffc ); \
-		bli_toggle_uplo( &uploc ); \
-		bli_toggle_trans( &transc ); \
-	} \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = rs_c; \
-		vs_c           = cs_c; \
-		diagoffc_inc   = -( doff_t )panel_dim_max; \
-		ldp            = rs_p; \
-		m_panel_full   = &m; \
-		n_panel_full   = &panel_dim_i; \
-		m_panel_use    = &panel_len_i; \
-		n_panel_use    = &panel_dim_i; \
-		m_panel_max    = &panel_len_max_i; \
-		n_panel_max    = &panel_dim_max; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = cs_c; \
-		vs_c           = rs_c; \
-		diagoffc_inc   = ( doff_t )panel_dim_max; \
-		ldp            = cs_p; \
-		m_panel_full   = &panel_dim_i; \
-		n_panel_full   = &n; \
-		m_panel_use    = &panel_dim_i; \
-		n_panel_use    = &panel_len_i; \
-		m_panel_max    = &panel_dim_max; \
-		n_panel_max    = &panel_len_max_i; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
-	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
-	{ \
-		ic0    = (num_iter - 1) * panel_dim_max; \
-		ic_inc = -panel_dim_max; \
-		ip0    = num_iter - 1; \
-		ip_inc = -1; \
-	} \
-	else \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-		ip0    = 0; \
-		ip_inc = 1; \
-	} \
-\
-	p_begin = p_cast; \
-\
-	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < num_iter; \
-	      ic += ic_inc, ip += ip_inc, it += 1 ) \
-	{ \
-		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
-		c_begin     = c_cast   + (ic  )*vs_c; \
-\
-		if ( bli_is_triangular( strucc ) &&  \
-		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is completely unstored (ie: zero). If the panel
-			   is unstored, we do nothing. (Notice that we don't even
-			   increment p_begin.) */ \
-\
-			continue; \
-		} \
-		else if ( bli_is_triangular( strucc ) &&  \
-		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is diagonal-intersecting. Notice that we
-			   cannot bury the following conditional logic into
-			   packm_struc_cxk() because we need to know the value of
-			   panel_len_max_i so we can properly increment p_inc. */ \
-\
-			/* Sanity check. Diagonals should not intersect the short end of
-			   a micro-panel. If they do, then somehow the constraints on
-			   cache blocksizes being a whole multiple of the register
-			   blocksizes was somehow violated. */ \
-			if ( ( col_stored && diagoffc_i < 0 ) || \
-			     ( row_stored && diagoffc_i > 0 ) ) \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-			          ( col_stored && bli_is_lower( uploc ) ) )  \
-			{ \
-				panel_off_i     = 0; \
-				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
-				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
-				                           panel_len_max ); \
-				diagoffp_i      = diagoffc_i; \
-			} \
-			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
-			{ \
-				panel_off_i     = bli_abs( diagoffc_i ); \
-				panel_len_i     = panel_len_full - panel_off_i; \
-				panel_len_max_i = panel_len_max  - panel_off_i; \
-				diagoffp_i      = 0; \
-			} \
-\
-			c_use = c_begin + (panel_off_i  )*ldc; \
-			p_use = p_begin; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffp_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-				                       is_p ); \
-			} \
-\
-			/* NOTE: This value is usually LESS than ps_p because triangular
-			   matrices usually have several micro-panels that are shorter
-			   than a "full" micro-panel. */ \
-			p_inc = ldp * panel_len_max_i; \
-\
-			/* We nudge the panel increment up by one if it is odd. */ \
-			p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
-		} \
-		else if ( bli_is_herm_or_symm( strucc ) ) \
-		{ \
-			/* This case executes if the panel belongs to a Hermitian or
-			   symmetric matrix, which includes stored, unstored, and
-			   diagonal-intersecting panels. */ \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffc_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_begin, rs_c, cs_c, \
-				                p_begin, rs_p, cs_p, \
-				                         is_p ); \
-			} \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			/*p_inc = ldp * panel_len_max_i;*/ \
-			p_inc = ps_p; \
-		} \
-		else \
-		{ \
-			/* This case executes if the panel is general, or, if the
-			   panel is part of a triangular matrix and is neither unstored
-			   (ie: zero) nor diagonal-intersecting. */ \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( BLIS_GENERAL, \
-				                0, \
-				                diagc, \
-				                BLIS_DENSE, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_begin, rs_c, cs_c, \
-				                p_begin, rs_p, cs_p, \
-				                         is_p ); \
-			} \
-/*
-			if ( row_stored ) \
-			PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
-			                      p_begin, rs_p, cs_p, "%9.2e", "" ); \
-			else if ( col_stored ) \
-			PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
-			                      p_begin, rs_p, cs_p, "%9.2e", "" ); \
-*/ \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			/*p_inc = ldp * panel_len_max_i;*/ \
-			p_inc = ps_p; \
-		} \
-\
-\
-		p_begin += p_inc; \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
-
--- a/frame/1m/packm/bli_packm_blk_var1rr.c
+++ b/frame/1m/packm/bli_packm_blk_var1rr.c
@@ -0,0 +1,737 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T packm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       struc_t strucc,
+       doff_t  diagoffc,
+       diag_t  diagc,
+       uplo_t  uploc,
+       trans_t transc,
+       pack_t  schema,
+       bool_t  invdiag,
+       bool_t  revifup,
+       bool_t  reviflo,
+       dim_t   m,
+       dim_t   n,
+       dim_t   m_max,
+       dim_t   n_max,
+       void*   kappa,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       void*   p, inc_t rs_p, inc_t cs_p,
+                  inc_t is_p,
+                  dim_t pd_p, inc_t ps_p,
+       void*   packm_ker,
+       cntx_t* cntx,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1rr);
+
+
+static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
+{
+                /* float (0)  scomplex (1)  double (2)  dcomplex (3) */
+// 0000 row/col panels
+               { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
+                   bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
+// 0001 row/col panels: 4m interleaved
+               { { NULL,                      bli_cpackm_struc_cxk_4mi,
+                   NULL,                      bli_zpackm_struc_cxk_4mi,  } },
+// 0010 row/col panels: 3m interleaved
+               { { NULL,                      bli_cpackm_struc_cxk_3mis,
+                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
+// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
+               { { NULL,                      NULL,
+                   NULL,                      NULL,                      } },
+// 0100 row/col panels: 3m separated
+               { { NULL,                      bli_cpackm_struc_cxk_3mis,
+                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
+// 0101 row/col panels: real only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 0110 row/col panels: imaginary only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 0111 row/col panels: real+imaginary only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 1000 row/col panels: 1m-expanded (1e)
+               { { NULL,                      bli_cpackm_struc_cxk_1er,
+                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
+// 1001 row/col panels: 1m-reordered (1r)
+               { { NULL,                      bli_cpackm_struc_cxk_1er,
+                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
+};
+
+
+void bli_packm_blk_var1rr
+     (
+       obj_t*   c,
+       obj_t*   p,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       thrinfo_t* t
+     )
+{
+	num_t     dt_cp      = bli_obj_dt( c );
+
+	struc_t   strucc     = bli_obj_struc( c );
+	doff_t    diagoffc   = bli_obj_diag_offset( c );
+	diag_t    diagc      = bli_obj_diag( c );
+	uplo_t    uploc      = bli_obj_uplo( c );
+	trans_t   transc     = bli_obj_conjtrans_status( c );
+	pack_t    schema     = bli_obj_pack_schema( p );
+	bool_t    invdiag    = bli_obj_has_inverted_diag( p );
+	bool_t    revifup    = bli_obj_is_pack_rev_if_upper( p );
+	bool_t    reviflo    = bli_obj_is_pack_rev_if_lower( p );
+
+	dim_t     m_p        = bli_obj_length( p );
+	dim_t     n_p        = bli_obj_width( p );
+	dim_t     m_max_p    = bli_obj_padded_length( p );
+	dim_t     n_max_p    = bli_obj_padded_width( p );
+
+	void*     buf_c      = bli_obj_buffer_at_off( c );
+	inc_t     rs_c       = bli_obj_row_stride( c );
+	inc_t     cs_c       = bli_obj_col_stride( c );
+
+	void*     buf_p      = bli_obj_buffer_at_off( p );
+	inc_t     rs_p       = bli_obj_row_stride( p );
+	inc_t     cs_p       = bli_obj_col_stride( p );
+	inc_t     is_p       = bli_obj_imag_stride( p );
+	dim_t     pd_p       = bli_obj_panel_dim( p );
+	inc_t     ps_p       = bli_obj_panel_stride( p );
+
+	obj_t     kappa;
+	obj_t*    kappa_p;
+	void*     buf_kappa;
+
+	func_t*   packm_kers;
+	void*     packm_ker;
+
+	FUNCPTR_T f;
+
+
+	// Treatment of kappa (ie: scaling during packing) depends on
+	// whether we are executing an induced method.
+	if ( bli_is_nat_packed( schema ) )
+	{
+		// This branch is for native execution, where we assume that
+		// the micro-kernel will always apply the alpha scalar of the
+		// higher-level operation. Thus, we use BLIS_ONE for kappa so
+		// that the underlying packm implementation does not perform
+		// any scaling during packing.
+		buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
+	}
+	else // if ( bli_is_ind_packed( schema ) )
+	{
+		// The value for kappa we use will depend on whether the scalar
+		// attached to A has a nonzero imaginary component. If it does,
+		// then we will apply the scalar during packing to facilitate
+		// implementing induced complex domain algorithms in terms of
+		// real domain micro-kernels. (In the aforementioned situation,
+		// applying a real scalar is easy, but applying a complex one is
+		// harder, so we avoid the need altogether with the code below.)
+		if ( bli_obj_scalar_has_nonzero_imag( p ) )
+		{
+			//printf( "applying non-zero imag kappa\n" );
+
+			// Detach the scalar.
+			bli_obj_scalar_detach( p, &kappa );
+
+			// Reset the attached scalar (to 1.0).
+			bli_obj_scalar_reset( p );
+
+			kappa_p = &kappa;
+		}
+		else
+		{
+			// If the internal scalar of A has only a real component, then
+			// we will apply it later (in the micro-kernel), and so we will
+			// use BLIS_ONE to indicate no scaling during packing.
+			kappa_p = &BLIS_ONE;
+		}
+
+		// Acquire the buffer to the kappa chosen above.
+		buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
+	}
+
+
+	// Choose the correct func_t object based on the pack_t schema.
+#if 0
+	if      ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
+	else if ( bli_is_3mi_packed( schema ) ||
+	          bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
+	else if ( bli_is_ro_packed( schema ) ||
+	          bli_is_io_packed( schema ) ||
+	         bli_is_rpi_packed( schema ) )  packm_kers = packm_struc_cxk_rih_kers;
+	else                                    packm_kers = packm_struc_cxk_kers;
+#else
+	// The original idea here was to read the packm_ukr from the context
+	// if it is non-NULL. The problem is, it requires that we be able to
+	// assume that the packm_ukr field is initialized to NULL, which it
+	// currently is not.
+
+	//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
+
+	//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
+	{
+		// If the packm structure-aware kernel func_t in the context is
+		// NULL (which is the default value after the context is created),
+		// we use the default lookup table to determine the right func_t
+		// for the current schema.
+		const dim_t i = bli_pack_schema_index( schema );
+
+		packm_kers = &packm_struc_cxk_kers[ i ];
+	}
+#if 0
+	else // cntx's packm func_t overrides
+	{
+		// If the packm structure-aware kernel func_t in the context is
+		// non-NULL (ie: assumed to be valid), we use that instead.
+		//packm_kers = bli_cntx_packm_ukrs( cntx );
+		packm_kers = cntx_packm_kers;
+	}
+#endif
+#endif
+
+	// Query the datatype-specific function pointer from the func_t object.
+	packm_ker = bli_func_get_dt( dt_cp, packm_kers );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_cp];
+
+	// Invoke the function.
+	f( strucc,
+	   diagoffc,
+	   diagc,
+	   uploc,
+	   transc,
+	   schema,
+	   invdiag,
+	   revifup,
+	   reviflo,
+	   m_p,
+	   n_p,
+	   m_max_p,
+	   n_max_p,
+	   buf_kappa,
+	   buf_c, rs_c, cs_c,
+	   buf_p, rs_p, cs_p,
+	          is_p,
+	          pd_p, ps_p,
+	   packm_ker,
+	   cntx,
+	   t );
+}
+
+
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       struc_t strucc, \
+       doff_t  diagoffc, \
+       diag_t  diagc, \
+       uplo_t  uploc, \
+       trans_t transc, \
+       pack_t  schema, \
+       bool_t  invdiag, \
+       bool_t  revifup, \
+       bool_t  reviflo, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       void*   kappa, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       void*   p, inc_t rs_p, inc_t cs_p, \
+                  inc_t is_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       void*   packm_ker, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
+\
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict c_begin; \
+	ctype* restrict p_begin; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it, ic, ip; \
+	dim_t           ic0, ip0; \
+	doff_t          ic_inc, ip_inc; \
+	doff_t          diagoffc_i; \
+	doff_t          diagoffc_inc; \
+	dim_t           panel_len_full; \
+	dim_t           panel_len_i; \
+	dim_t           panel_len_max; \
+	dim_t           panel_len_max_i; \
+	dim_t           panel_dim_i; \
+	dim_t           panel_dim_max; \
+	dim_t           panel_off_i; \
+	inc_t           vs_c; \
+	inc_t           ldc; \
+	inc_t           ldp, p_inc; \
+	dim_t*          m_panel_full; \
+	dim_t*          n_panel_full; \
+	dim_t*          m_panel_use; \
+	dim_t*          n_panel_use; \
+	dim_t*          m_panel_max; \
+	dim_t*          n_panel_max; \
+	conj_t          conjc; \
+	bool_t          row_stored; \
+	bool_t          col_stored; \
+	inc_t           is_p_use; \
+	dim_t           ss_num; \
+	dim_t           ss_den; \
+\
+	ctype* restrict c_use; \
+	ctype* restrict p_use; \
+	doff_t          diagoffp_i; \
+\
+\
+	/* If C is zeros and part of a triangular matrix, then we don't need
+	   to pack it. */ \
+	if ( bli_is_zeros( uploc ) && \
+	     bli_is_triangular( strucc ) ) return; \
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* If c needs a transposition, induce it so that we can more simply
+	   express the remaining parameters and code. */ \
+	if ( bli_does_trans( transc ) ) \
+	{ \
+		bli_swap_incs( &rs_c, &cs_c ); \
+		bli_negate_diag_offset( &diagoffc ); \
+		bli_toggle_uplo( &uploc ); \
+		bli_toggle_trans( &transc ); \
+	} \
+\
+	/* Create flags to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	row_stored = bli_is_col_packed( schema ); \
+	col_stored = bli_is_row_packed( schema ); \
+\
+	/* If the row storage flag indicates row storage, then we are packing
+	   to column panels; otherwise, if the flags indicate column storage,
+	   we are packing to row panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len_full = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		ldc            = rs_c; \
+		vs_c           = cs_c; \
+		diagoffc_inc   = -( doff_t )panel_dim_max; \
+		ldp            = rs_p; \
+		m_panel_full   = &m; \
+		n_panel_full   = &panel_dim_i; \
+		m_panel_use    = &panel_len_i; \
+		n_panel_use    = &panel_dim_i; \
+		m_panel_max    = &panel_len_max_i; \
+		n_panel_max    = &panel_dim_max; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len_full = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		ldc            = cs_c; \
+		vs_c           = rs_c; \
+		diagoffc_inc   = ( doff_t )panel_dim_max; \
+		ldp            = cs_p; \
+		m_panel_full   = &panel_dim_i; \
+		n_panel_full   = &n; \
+		m_panel_use    = &panel_dim_i; \
+		n_panel_use    = &panel_len_i; \
+		m_panel_max    = &panel_dim_max; \
+		n_panel_max    = &panel_len_max_i; \
+	} \
+\
+	/* Compute the storage stride scaling. Usually this is just 1. However,
+	   in the case of interleaved 3m, we need to scale by 3/2, and in the
+	   cases of real-only, imag-only, or summed-only, we need to scale by
+	   1/2. In both cases, we are compensating for the fact that pointer
+	   arithmetic occurs in terms of complex elements rather than real
+	   elements. */ \
+	if      ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
+	else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
+	else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
+	else                                    { ss_num = 1; ss_den = 1; } \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	/* Set the initial values and increments for indices related to C and P
+	   based on whether reverse iteration was requested. */ \
+	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
+	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
+	{ \
+		ic0    = (n_iter - 1) * panel_dim_max; \
+		ic_inc = -panel_dim_max; \
+		ip0    = n_iter - 1; \
+		ip_inc = -1; \
+	} \
+	else \
+	{ \
+		ic0    = 0; \
+		ic_inc = panel_dim_max; \
+		ip0    = 0; \
+		ip_inc = 1; \
+	} \
+\
+	p_begin = p_cast; \
+\
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. */ \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter; \
+	      ic += ic_inc, ip += ip_inc, it += 1 ) \
+	{ \
+		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
+		c_begin     = c_cast   + (ic  )*vs_c; \
+\
+		if ( bli_is_triangular( strucc ) &&  \
+		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
+		{ \
+			/* This case executes if the panel belongs to a triangular
+			   matrix AND is completely unstored (ie: zero). If the panel
+			   is unstored, we do nothing. (Notice that we don't even
+			   increment p_begin.) */ \
+\
+			continue; \
+		} \
+		else if ( bli_is_triangular( strucc ) &&  \
+		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
+		{ \
+			/* This case executes if the panel belongs to a triangular
+			   matrix AND is diagonal-intersecting. Notice that we
+			   cannot bury the following conditional logic into
+			   packm_struc_cxk() because we need to know the value of
+			   panel_len_max_i so we can properly increment p_inc. */ \
+\
+			/* Sanity check. Diagonals should not intersect the short end of
+			   a micro-panel. If they do, then somehow the constraints on
+			   cache blocksizes being a whole multiple of the register
+			   blocksizes was somehow violated. */ \
+			if ( ( col_stored && diagoffc_i < 0 ) || \
+			     ( row_stored && diagoffc_i > 0 ) ) \
+				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+\
+			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
+			          ( col_stored && bli_is_lower( uploc ) ) )  \
+			{ \
+				panel_off_i     = 0; \
+				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
+				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
+				                           panel_len_max ); \
+				diagoffp_i      = diagoffc_i; \
+			} \
+			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
+			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
+			{ \
+				panel_off_i     = bli_abs( diagoffc_i ); \
+				panel_len_i     = panel_len_full - panel_off_i; \
+				panel_len_max_i = panel_len_max  - panel_off_i; \
+				diagoffp_i      = 0; \
+			} \
+\
+			c_use = c_begin + (panel_off_i  )*ldc; \
+			p_use = p_begin; \
+\
+			/* We need to re-compute the imaginary stride as a function of
+			   panel_len_max_i since triangular packed matrices have panels
+			   of varying lengths. NOTE: This imaginary stride value is
+			   only referenced by the packm kernels for induced methods. */ \
+			is_p_use  = ldp * panel_len_max_i; \
+\
+			/* We nudge the imaginary stride up by one if it is odd. */ \
+			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
+\
+			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				packm_ker_cast( strucc, \
+				                diagoffp_i, \
+				                diagc, \
+				                uploc, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			/* NOTE: This value is usually LESS than ps_p because triangular
+			   matrices usually have several micro-panels that are shorter
+			   than a "full" micro-panel. */ \
+			p_inc = ( is_p_use * ss_num ) / ss_den; \
+		} \
+		else if ( bli_is_herm_or_symm( strucc ) ) \
+		{ \
+			/* This case executes if the panel belongs to a Hermitian or
+			   symmetric matrix, which includes stored, unstored, and
+			   diagonal-intersecting panels. */ \
+\
+			c_use = c_begin; \
+			p_use = p_begin; \
+\
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			is_p_use = is_p; \
+\
+			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				packm_ker_cast( strucc, \
+				                diagoffc_i, \
+				                diagc, \
+				                uploc, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			p_inc = ps_p; \
+		} \
+		else \
+		{ \
+			/* This case executes if the panel is general, or, if the
+			   panel is part of a triangular matrix and is neither unstored
+			   (ie: zero) nor diagonal-intersecting. */ \
+\
+			c_use = c_begin; \
+			p_use = p_begin; \
+\
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			is_p_use = is_p; \
+\
+			if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
+			{ \
+/*
+printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
+*/ \
+				packm_ker_cast( BLIS_GENERAL, \
+				                0, \
+				                diagc, \
+				                BLIS_DENSE, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			/* NOTE: This value is equivalent to ps_p. */ \
+			p_inc = ps_p; \
+		} \
+\
+		p_begin += p_inc; \
+\
+	} \
+/*
+printf( "thread %d: done\n", (int)tid ); \
+*/ \
+}
+
+INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1rr )
+
+
+
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+if ( col_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+*/
+/*
+if ( col_stored ) { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+else { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+*/
+/*
+		if ( bli_is_4mi_packed( schema ) ) { \
+		printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
+		if ( col_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		if ( row_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		} \
+*/
+/*
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+*/
+/*
+		if ( row_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		inc_t is_b = rs_p * *m_panel_max; \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
+/*
+		if ( col_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
--- a/frame/1m/packm/bli_packm_blk_var1sl.c
+++ b/frame/1m/packm/bli_packm_blk_var1sl.c
@@ -0,0 +1,737 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T packm_fp
+// Signature shared by the datatype-specific variants stored in ftypes[].
+typedef void (*FUNCPTR_T)
+     (
+       struc_t strucc,
+       doff_t  diagoffc,
+       diag_t  diagc,
+       uplo_t  uploc,
+       trans_t transc,
+       pack_t  schema,
+       bool_t  invdiag,
+       bool_t  revifup,
+       bool_t  reviflo,
+       dim_t   m,
+       dim_t   n,
+       dim_t   m_max,
+       dim_t   n_max,
+       void*   kappa,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       void*   p, inc_t rs_p, inc_t cs_p,
+                  inc_t is_p,
+                  dim_t pd_p, inc_t ps_p,
+       void*   packm_ker,
+       cntx_t* cntx,
+       thrinfo_t* thread
+     );
+// Table of typed variants; indexed by datatype below (f = ftypes[dt_cp]).
+static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1sl);
+
+
+static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
+{
+/* Rows: bli_pack_schema_index( schema ). Cols: float (0)  scomplex (1)  double (2)  dcomplex (3) */
+// 0000 row/col panels
+               { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
+                   bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
+// 0001 row/col panels: 4m interleaved
+               { { NULL,                      bli_cpackm_struc_cxk_4mi,
+                   NULL,                      bli_zpackm_struc_cxk_4mi,  } },
+// 0010 row/col panels: 3m interleaved
+               { { NULL,                      bli_cpackm_struc_cxk_3mis,
+                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
+// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
+               { { NULL,                      NULL,
+                   NULL,                      NULL,                      } },
+// 0100 row/col panels: 3m separated
+               { { NULL,                      bli_cpackm_struc_cxk_3mis,
+                   NULL,                      bli_zpackm_struc_cxk_3mis, } },
+// 0101 row/col panels: real only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 0110 row/col panels: imaginary only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 0111 row/col panels: real+imaginary only
+               { { NULL,                      bli_cpackm_struc_cxk_rih,
+                   NULL,                      bli_zpackm_struc_cxk_rih,  } },
+// 1000 row/col panels: 1m-expanded (1e)
+               { { NULL,                      bli_cpackm_struc_cxk_1er,
+                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
+// 1001 row/col panels: 1m-reordered (1r)
+               { { NULL,                      bli_cpackm_struc_cxk_1er,
+                   NULL,                      bli_zpackm_struc_cxk_1er,  } },
+};
+
+
+void bli_packm_blk_var1sl
+     (
+       obj_t*   c,
+       obj_t*   p,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       thrinfo_t* t
+     )
+{
+	num_t     dt_cp      = bli_obj_dt( c );
+
+	struc_t   strucc     = bli_obj_struc( c );
+	doff_t    diagoffc   = bli_obj_diag_offset( c );
+	diag_t    diagc      = bli_obj_diag( c );
+	uplo_t    uploc      = bli_obj_uplo( c );
+	trans_t   transc     = bli_obj_conjtrans_status( c );
+	pack_t    schema     = bli_obj_pack_schema( p );
+	bool_t    invdiag    = bli_obj_has_inverted_diag( p );
+	bool_t    revifup    = bli_obj_is_pack_rev_if_upper( p );
+	bool_t    reviflo    = bli_obj_is_pack_rev_if_lower( p );
+
+	dim_t     m_p        = bli_obj_length( p );
+	dim_t     n_p        = bli_obj_width( p );
+	dim_t     m_max_p    = bli_obj_padded_length( p );
+	dim_t     n_max_p    = bli_obj_padded_width( p );
+
+	void*     buf_c      = bli_obj_buffer_at_off( c );
+	inc_t     rs_c       = bli_obj_row_stride( c );
+	inc_t     cs_c       = bli_obj_col_stride( c );
+
+	void*     buf_p      = bli_obj_buffer_at_off( p );
+	inc_t     rs_p       = bli_obj_row_stride( p );
+	inc_t     cs_p       = bli_obj_col_stride( p );
+	inc_t     is_p       = bli_obj_imag_stride( p );
+	dim_t     pd_p       = bli_obj_panel_dim( p );
+	inc_t     ps_p       = bli_obj_panel_stride( p );
+
+	obj_t     kappa;
+	obj_t*    kappa_p;
+	void*     buf_kappa;
+
+	func_t*   packm_kers;
+	void*     packm_ker;
+
+	FUNCPTR_T f;
+	// Object-level front end: pick kappa and the kernel, then call ftypes[dt_cp].
+
+	// Treatment of kappa (ie: scaling during packing) depends on
+	// whether we are executing an induced method.
+	if ( bli_is_nat_packed( schema ) )
+	{
+		// This branch is for native execution, where we assume that
+		// the micro-kernel will always apply the alpha scalar of the
+		// higher-level operation. Thus, we use BLIS_ONE for kappa so
+		// that the underlying packm implementation does not perform
+		// any scaling during packing.
+		buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
+	}
+	else // if ( bli_is_ind_packed( schema ) )
+	{
+		// The value for kappa we use will depend on whether the scalar
+		// attached to p has a nonzero imaginary component. If it does,
+		// then we will apply the scalar during packing to facilitate
+		// implementing induced complex domain algorithms in terms of
+		// real domain micro-kernels. (In the aforementioned situation,
+		// applying a real scalar is easy, but applying a complex one is
+		// harder, so we avoid the need altogether with the code below.)
+		if ( bli_obj_scalar_has_nonzero_imag( p ) )
+		{
+			//printf( "applying non-zero imag kappa\n" );
+
+			// Detach the scalar.
+			bli_obj_scalar_detach( p, &kappa );
+
+			// Reset the attached scalar (to 1.0).
+			bli_obj_scalar_reset( p );
+
+			kappa_p = &kappa;
+		}
+		else
+		{
+			// If the internal scalar of p has only a real component, then
+			// we will apply it later (in the micro-kernel), and so we will
+			// use BLIS_ONE to indicate no scaling during packing.
+			kappa_p = &BLIS_ONE;
+		}
+
+		// Acquire the buffer to the kappa chosen above.
+		buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
+	}
+
+
+	// Choose the correct func_t object based on the pack_t schema.
+#if 0
+	if      ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
+	else if ( bli_is_3mi_packed( schema ) ||
+	          bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
+	else if ( bli_is_ro_packed( schema ) ||
+	          bli_is_io_packed( schema ) ||
+	         bli_is_rpi_packed( schema ) )  packm_kers = packm_struc_cxk_rih_kers;
+	else                                    packm_kers = packm_struc_cxk_kers;
+#else
+	// The original idea here was to read the packm_ukr from the context
+	// if it is non-NULL. The problem is, it requires that we be able to
+	// assume that the packm_ukr field is initialized to NULL, which it
+	// currently is not.
+
+	//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
+
+	//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
+	{
+		// Because the context query above is disabled, this block always
+		// runs: we use the default lookup table, indexed by the pack
+		// schema, to determine the right func_t for the current schema.
+		// NOTE(review): table entries may be NULL; no check is made here.
+		const dim_t i = bli_pack_schema_index( schema );
+
+		packm_kers = &packm_struc_cxk_kers[ i ];
+	}
+#if 0
+	else // cntx's packm func_t overrides
+	{
+		// If the packm structure-aware kernel func_t in the context is
+		// non-NULL (ie: assumed to be valid), we use that instead.
+		//packm_kers = bli_cntx_packm_ukrs( cntx );
+		packm_kers = cntx_packm_kers;
+	}
+#endif
+#endif
+
+	// Query the datatype-specific function pointer from the func_t object.
+	packm_ker = bli_func_get_dt( dt_cp, packm_kers );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_cp];
+
+	// Invoke the function.
+	f( strucc,
+	   diagoffc,
+	   diagc,
+	   uploc,
+	   transc,
+	   schema,
+	   invdiag,
+	   revifup,
+	   reviflo,
+	   m_p,
+	   n_p,
+	   m_max_p,
+	   n_max_p,
+	   buf_kappa,
+	   buf_c, rs_c, cs_c,
+	   buf_p, rs_p, cs_p,
+	          is_p,
+	          pd_p, ps_p,
+	   packm_ker,
+	   cntx,
+	   t );
+}
+
+
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+/* Typed packm variant: pack c into micropanels of p (slab partitioning). */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       struc_t strucc, \
+       doff_t  diagoffc, \
+       diag_t  diagc, \
+       uplo_t  uploc, \
+       trans_t transc, \
+       pack_t  schema, \
+       bool_t  invdiag, \
+       bool_t  revifup, \
+       bool_t  reviflo, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       void*   kappa, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       void*   p, inc_t rs_p, inc_t cs_p, \
+                  inc_t is_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       void*   packm_ker, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
+\
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict c_begin; \
+	ctype* restrict p_begin; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it, ic, ip; \
+	dim_t           ic0, ip0; \
+	doff_t          ic_inc, ip_inc; \
+	doff_t          diagoffc_i; \
+	doff_t          diagoffc_inc; \
+	dim_t           panel_len_full; \
+	dim_t           panel_len_i; \
+	dim_t           panel_len_max; \
+	dim_t           panel_len_max_i; \
+	dim_t           panel_dim_i; \
+	dim_t           panel_dim_max; \
+	dim_t           panel_off_i; \
+	inc_t           vs_c; \
+	inc_t           ldc; \
+	inc_t           ldp, p_inc; \
+	dim_t*          m_panel_full; \
+	dim_t*          n_panel_full; \
+	dim_t*          m_panel_use; \
+	dim_t*          n_panel_use; \
+	dim_t*          m_panel_max; \
+	dim_t*          n_panel_max; \
+	conj_t          conjc; \
+	bool_t          row_stored; \
+	bool_t          col_stored; \
+	inc_t           is_p_use; \
+	dim_t           ss_num; \
+	dim_t           ss_den; \
+\
+	ctype* restrict c_use; \
+	ctype* restrict p_use; \
+	doff_t          diagoffp_i; \
+\
+\
+	/* If C is zeros and part of a triangular matrix, then we don't need
+	   to pack it. */ \
+	if ( bli_is_zeros( uploc ) && \
+	     bli_is_triangular( strucc ) ) return; \
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* If c needs a transposition, induce it so that we can more simply
+	   express the remaining parameters and code. */ \
+	if ( bli_does_trans( transc ) ) \
+	{ \
+		bli_swap_incs( &rs_c, &cs_c ); \
+		bli_negate_diag_offset( &diagoffc ); \
+		bli_toggle_uplo( &uploc ); \
+		bli_toggle_trans( &transc ); \
+	} \
+\
+	/* Create flags to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	row_stored = bli_is_col_packed( schema ); \
+	col_stored = bli_is_row_packed( schema ); \
+\
+	/* If the row storage flag indicates row storage, then we are packing
+	   to column panels; otherwise, if the strides indicate column storage,
+	   we are packing to row panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len_full = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		ldc            = rs_c; \
+		vs_c           = cs_c; \
+		diagoffc_inc   = -( doff_t )panel_dim_max; \
+		ldp            = rs_p; \
+		m_panel_full   = &m; \
+		n_panel_full   = &panel_dim_i; \
+		m_panel_use    = &panel_len_i; \
+		n_panel_use    = &panel_dim_i; \
+		m_panel_max    = &panel_len_max_i; \
+		n_panel_max    = &panel_dim_max; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len_full = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		ldc            = cs_c; \
+		vs_c           = rs_c; \
+		diagoffc_inc   = ( doff_t )panel_dim_max; \
+		ldp            = cs_p; \
+		m_panel_full   = &panel_dim_i; \
+		n_panel_full   = &n; \
+		m_panel_use    = &panel_dim_i; \
+		n_panel_use    = &panel_len_i; \
+		m_panel_max    = &panel_dim_max; \
+		n_panel_max    = &panel_len_max_i; \
+	} \
+\
+	/* Compute the storage stride scaling. Usually this is just 1. However,
+	   in the case of interleaved 3m, we need to scale by 3/2, and in the
+	   cases of real-only, imag-only, or summed-only, we need to scale by
+	   1/2. In both cases, we are compensating for the fact that pointer
+	   arithmetic occurs in terms of complex elements rather than real
+	   elements. */ \
+	if      ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
+	else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
+	else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
+	else                                    { ss_num = 1; ss_den = 1; } \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	/* Set the initial values and increments for indices related to C and P
+	   based on whether reverse iteration was requested. */ \
+	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
+	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
+	{ \
+		ic0    = (n_iter - 1) * panel_dim_max; \
+		ic_inc = -panel_dim_max; \
+		ip0    = n_iter - 1; \
+		ip_inc = -1; \
+	} \
+	else \
+	{ \
+		ic0    = 0; \
+		ic_inc = panel_dim_max; \
+		ip0    = 0; \
+		ip_inc = 1; \
+	} \
+\
+	p_begin = p_cast; \
+\
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. */ \
+	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter; \
+	      ic += ic_inc, ip += ip_inc, it += 1 ) \
+	{ \
+		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
+		c_begin     = c_cast   + (ic  )*vs_c; \
+\
+		if ( bli_is_triangular( strucc ) &&  \
+		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
+		{ \
+			/* This case executes if the panel belongs to a triangular
+			   matrix AND is completely unstored (ie: zero). If the panel
+			   is unstored, we do nothing. (Notice that we don't even
+			   increment p_begin.) */ \
+\
+			continue; \
+		} \
+		else if ( bli_is_triangular( strucc ) &&  \
+		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
+		{ \
+			/* This case executes if the panel belongs to a triangular
+			   matrix AND is diagonal-intersecting. Notice that we
+			   cannot bury the following conditional logic into
+			   packm_struc_cxk() because we need to know the value of
+			   panel_len_max_i so we can properly compute p_inc. */ \
+\
+			/* Sanity check. Diagonals should not intersect the short end of
+			   a micro-panel. If they do, then the constraint that cache
+			   blocksizes be whole multiples of the register blocksizes
+			   was somehow violated. */ \
+			if ( ( col_stored && diagoffc_i < 0 ) || \
+			     ( row_stored && diagoffc_i > 0 ) ) \
+				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+\
+			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
+			          ( col_stored && bli_is_lower( uploc ) ) )  \
+			{ \
+				panel_off_i     = 0; \
+				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
+				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
+				                           panel_len_max ); \
+				diagoffp_i      = diagoffc_i; \
+			} \
+			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
+			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
+			{ \
+				panel_off_i     = bli_abs( diagoffc_i ); \
+				panel_len_i     = panel_len_full - panel_off_i; \
+				panel_len_max_i = panel_len_max  - panel_off_i; \
+				diagoffp_i      = 0; \
+			} \
+\
+			c_use = c_begin + (panel_off_i  )*ldc; \
+			p_use = p_begin; \
+\
+			/* We need to re-compute the imaginary stride as a function of
+			   panel_len_max_i since triangular packed matrices have panels
+			   of varying lengths. NOTE: This imaginary stride value is
+			   only referenced by the packm kernels for induced methods. */ \
+			is_p_use  = ldp * panel_len_max_i; \
+\
+			/* We nudge the imaginary stride up by one if it is odd. */ \
+			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
+\
+			if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				packm_ker_cast( strucc, \
+				                diagoffp_i, \
+				                diagc, \
+				                uploc, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			/* NOTE: This value is usually LESS than ps_p because triangular
+			   matrices usually have several micro-panels that are shorter
+			   than a "full" micro-panel. */ \
+			p_inc = ( is_p_use * ss_num ) / ss_den; \
+		} \
+		else if ( bli_is_herm_or_symm( strucc ) ) \
+		{ \
+			/* This case executes if the panel belongs to a Hermitian or
+			   symmetric matrix, which includes stored, unstored, and
+			   diagonal-intersecting panels. */ \
+\
+			c_use = c_begin; \
+			p_use = p_begin; \
+\
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			is_p_use = is_p; \
+\
+			if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				packm_ker_cast( strucc, \
+				                diagoffc_i, \
+				                diagc, \
+				                uploc, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			p_inc = ps_p; \
+		} \
+		else \
+		{ \
+			/* This case executes if the panel is general, or, if the
+			   panel is part of a triangular matrix and is neither unstored
+			   (ie: zero) nor diagonal-intersecting. */ \
+\
+			c_use = c_begin; \
+			p_use = p_begin; \
+\
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			is_p_use = is_p; \
+\
+			if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				/* Only the thread whose slab test passes for 'it' invokes
+				   the kernel; every thread still advances p_begin below so
+				   that all threads compute the same panel addresses. */ \
+				packm_ker_cast( BLIS_GENERAL, \
+				                0, \
+				                diagc, \
+				                BLIS_DENSE, \
+				                conjc, \
+				                schema, \
+				                invdiag, \
+				                *m_panel_use, \
+				                *n_panel_use, \
+				                *m_panel_max, \
+				                *n_panel_max, \
+				                kappa_cast, \
+				                c_use, rs_c, cs_c, \
+				                p_use, rs_p, cs_p, \
+			                           is_p_use, \
+				                cntx ); \
+			} \
+\
+			/* NOTE: This value is equivalent to ps_p. */ \
+			p_inc = ps_p; \
+		} \
+\
+		p_begin += p_inc; \
+\
+	} \
+	/* NOTE: No barrier is issued in this function; it returns once the
+	   calling thread has visited every micropanel. Any synchronization
+	   is presumably handled by the caller -- TODO confirm. */ \
+}
+// Instantiate the template above (presumably once per basic datatype).
+INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl )
+
+
+
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+if ( col_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+*/
+/*
+if ( col_stored ) { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+else { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+*/
+/*
+		if ( bli_is_4mi_packed( schema ) ) { \
+		printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
+		if ( col_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		if ( row_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		} \
+*/
+/*
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+*/
+/*
+		if ( row_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		inc_t is_b = rs_p * *m_panel_max; \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
+/*
+		if ( col_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
--- a/frame/1m/packm/bli_packm_thrinfo.h
+++ b/frame/1m/packm/bli_packm_thrinfo.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -36,7 +37,22 @@
 // thrinfo_t macros specific to packm.
 //

-#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
+/*
+#define bli_packm_thread_my_iter( index, thread ) \
+\
+	( index % thread->n_way == thread->work_id % thread->n_way )
+*/
+
+#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \
+	/* Round-robin: true when i and work_id are congruent modulo n_way; start and end are unused. */ \
+	( ( i ) % ( n_way ) == ( work_id ) % ( n_way ) )
+
+#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \
+	/* Slab: true when i lies in the half-open range [start,end); work_id and n_way are unused. */ \
+	( ( start ) <= ( i ) && ( i ) < ( end ) )
+
+
+

 //
 // thrinfo_t APIs specific to packm.
--- a/frame/1m/packm/bli_packm_blk_var1.h
+++ b/frame/1m/packm/bli_packm_blk_var1.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -32,15 +33,52 @@

 */

-void bli_packm_blk_var1
-     (
-       obj_t*   c,
-       obj_t*   p,
-       cntx_t*  cntx,
-       cntl_t*  cntl,
-       thrinfo_t* t
+//
+// Prototype object-based interfaces.
+//
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+\
+void PASTEMAC0(opname) \
+     ( \
+       obj_t*   c, \
+       obj_t*   p, \
+       cntx_t*  cntx, \
+       cntl_t*  cntl, \
+       thrinfo_t* t  \
     );

+GENPROT( packm_unb_var1   )
+GENPROT( packm_blk_var1   )
+GENPROT( packm_blk_var1sl )
+GENPROT( packm_blk_var1rr )
+
+//
+// Prototype BLAS-like interfaces with void pointer operands.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       struc_t strucc, \
+       doff_t  diagoffc, \
+       diag_t  diagc, \
+       uplo_t  uploc, \
+       trans_t transc, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       void*   kappa, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       void*   p, inc_t rs_p, inc_t cs_p, \
+       cntx_t* cntx  \
+     );
+
+INSERT_GENTPROT_BASIC0( packm_unb_var1 )

 #undef  GENTPROT
 #define GENTPROT( ctype, ch, varname ) \
@@ -70,5 +108,6 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( packm_blk_var1 )
+INSERT_GENTPROT_BASIC0( packm_blk_var1sl )
+INSERT_GENTPROT_BASIC0( packm_blk_var1rr )

--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -53,11 +54,34 @@ cntl_t* bli_gemmbp_cntl_create
       pack_t schema_b
     )
 {
-	void* macro_kernel_p = bli_gemm_ker_var2;
+	void* macro_kernel_fp;
+	void* packa_fp;
+	void* packb_fp;

-	// Change the macro-kernel if the operation family is herk or trmm.
-	if      ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
-	else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
+#ifdef BLIS_ENABLE_JRIR_SLAB
+
+	// Use the function pointers to the macrokernels that use slab
+	// assignment of micropanels to threads in the jr and ir loops.
+	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2sl;
+	else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2sl;
+	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2sl;
+	else                            macro_kernel_fp = NULL;
+
+	packa_fp = bli_packm_blk_var1sl;
+	packb_fp = bli_packm_blk_var1sl;
+
+#else // BLIS_ENABLE_JRIR_RR
+
+	// Use the function pointers to the macrokernels that use round-robin
+	// assignment of micropanels to threads in the jr and ir loops.
+	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2rr;
+	else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2rr;
+	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2rr;
+	else                            macro_kernel_fp = NULL;
+
+	packa_fp = bli_packm_blk_var1rr;
+	packb_fp = bli_packm_blk_var1rr;
+#endif

 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
@@ -72,7 +96,7 @@ cntl_t* bli_gemmbp_cntl_create
 	(
 	  family,
 	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
-	  macro_kernel_p,
+	  macro_kernel_fp,
 	  gemm_cntl_bu_ke
 	);

@@ -80,7 +104,7 @@ cntl_t* bli_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  bli_gemm_packa,  // pack the left-hand operand
-	  bli_packm_blk_var1,
+	  packa_fp,
 	  BLIS_MR,
 	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
@@ -104,7 +128,7 @@ cntl_t* bli_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  bli_gemm_packb,  // pack the right-hand operand
-	  bli_packm_blk_var1,
+	  packb_fp,
 	  BLIS_KR,
 	  BLIS_NR,
 	  FALSE,   // do NOT invert diagonal
--- a/frame/3/gemm/bli_gemm_int.c
+++ b/frame/3/gemm/bli_gemm_int.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -114,7 +115,9 @@ void bli_gemm_int

 		if ( im != BLIS_NAT )
 		{
-			if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
+			if ( im == BLIS_4M1B )
+			if ( f == bli_gemm_ker_var2sl ||
+			     f == bli_gemm_ker_var2rr ) f = bli_gemm4mb_ker_var2;
 		}
 	}

--- a/frame/3/gemm/bli_gemm_ker_var1.c
+++ b/frame/3/gemm/bli_gemm_ker_var1.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -32,6 +33,8 @@

 */

+#if 0
+
 #include "blis.h"

 void bli_gemm_ker_var1
@@ -55,3 +58,5 @@ void bli_gemm_ker_var1
 	bli_gemm_ker_var2( b, a, c, cntx, rntm, cntl, thread );
 }

+#endif
+
--- a/frame/3/gemm/bli_gemm_ker_var2rr.c
+++ b/frame/3/gemm/bli_gemm_ker_var2rr.c
@@ -56,10 +56,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr);

+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//

-void bli_gemm_ker_var2
+void bli_gemm_ker_var2rr
     (
       obj_t*  a,
       obj_t*  b,
@@ -117,7 +120,6 @@ void bli_gemm_ker_var2
    // real-valued beta, we can use the real domain macro-kernel, which
 	// eliminates a little overhead associated with the 1m virtual
 	// micro-kernel.
-#if 1
 	if ( bli_is_1m_packed( schema_a ) )
 	{
 		bli_l3_ind_recast_1m_params
@@ -131,7 +133,6 @@ void bli_gemm_ker_var2
 		  rs_c, cs_c
 		);
 	}
-#endif

 	// Index into the type combination array to extract the correct
 	// function pointer.
@@ -285,8 +286,8 @@ void PASTEMAC(ch,varname) \
 	dim_t jr_inc,   ir_inc; \
 \
 	/* Determine the thread range and increment for each thrinfo_t node. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_start; j < jr_end; j += jr_inc ) \
@@ -315,11 +316,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -369,11 +370,11 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 /*
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
 */ \
 }

-INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )

--- a/frame/3/gemm/bli_gemm_ker_var2sl.c
+++ b/frame/3/gemm/bli_gemm_ker_var2sl.c
@@ -0,0 +1,380 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+// Function pointer type whose parameter list matches the typed macrokernels defined below.
+typedef void (*FUNCPTR_T)
+     (
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+// NOTE(review): GENARRAY presumably builds the ftypes table of typed variants, indexed by datatype -- confirm.
+static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl);
+
+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//
+// bli_gemm_ker_var2sl(): object-based front-end that unpacks a, b, c and calls the typed macrokernel.
+void bli_gemm_ker_var2sl
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+	// Pack schemas recorded in the A and B objects; forwarded to the typed macrokernel.
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+	// m and n are the length and width of c; k is the width of a.
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+	// Buffer, column stride, imaginary stride, panel dimension and panel stride of a.
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+	// Buffer, row stride, imaginary stride, panel dimension and panel stride of b.
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+	// Buffer and row/column strides of c.
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach the scalars attached to A and B and merge them into scalar_b.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// If 1m is employed on a column- or row-stored matrix with a real-valued
+	// beta, the real domain macro-kernel can be used, avoiding some overhead
+	// of the 1m virtual micro-kernel. NOTE(review): the call below presumably
+	// updates dt_exec and the dimension/stride locals in place -- confirm.
+	if ( bli_is_1m_packed( schema_a ) )
+	{
+		bli_l3_ind_recast_1m_params
+		(
+		  dt_exec,
+		  schema_a,
+		  c,
+		  m, n, k,
+		  pd_a, ps_a,
+		  pd_b, ps_b,
+		  rs_c, cs_c
+		);
+	}
+
+	// Use the execution datatype to index into the ftypes array and select
+	// the function pointer of the matching typed macrokernel.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Typed gemm macrokernel; loop bounds come from bli_thread_range_jrir_sl(). */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           i, j; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+	/* A nonzero remainder adds one extra (edge) iteration in that dimension. */ \
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Determine the thread range and increment for each thrinfo_t node. */ \
+	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+		/* Use the full NR unless this is the edge iteration, which is n_left wide. */ \
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+			/* Use the full MR unless this is the edge iteration, which is m_left tall. */ \
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* Handle interior and edge cases separately. */ \
+			if ( m_cur == MR && n_cur == NR ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+			else \
+			{ \
+				/* Invoke the gemm micro-kernel on the temporary buffer ct, passing zero as beta. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale the m_cur x n_cur edge of C by beta and add the result from above (held in ct). */ \
+				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+				                        ct,  rs_ct, cs_ct, \
+				                        beta_cast, \
+				                        c11, rs_c,  cs_c ); \
+			} \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
+
--- a/frame/3/gemm/bli_gemm_var.h
+++ b/frame/3/gemm/bli_gemm_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -58,7 +59,9 @@ GENPROT( gemm_packa )
 GENPROT( gemm_packb )

 GENPROT( gemm_ker_var1 )
-GENPROT( gemm_ker_var2 )
+
+GENPROT( gemm_ker_var2sl )
+GENPROT( gemm_ker_var2rr )

 // Headers for induced algorithms:
 GENPROT( gemm4mb_ker_var2 ) // 4m1b
@@ -90,7 +93,8 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )

 // Headers for induced algorithms:
 INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b
--- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c
+++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c
@@ -300,11 +300,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_thread_id, ir_num_threads ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 					b2 = b_cast; \
 			} \
 \
--- a/frame/3/herk/bli_herk_l_ker_var2rr.c
+++ b/frame/3/herk/bli_herk_l_ker_var2rr.c
@@ -0,0 +1,555 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T herk_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffc,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+
+void bli_herk_l_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffc  = bli_obj_diag_offset( c );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach and multiply the scalars attached to A and B.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( diagoffc,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffc_ij; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           i, j, ip; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely above the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region above where the diagonal of C intersects
+	   the left edge of the panel, adjust the pointer to C and A and treat
+	   this case as if the diagonal offset were zero. */ \
+	if ( diagoffc < 0 ) \
+	{ \
+		ip       = -diagoffc / MR; \
+		i        = ip * MR; \
+		m        = m - i; \
+		diagoffc = -diagoffc % MR; \
+		c_cast   = c_cast + (i  )*rs_c; \
+		a_cast   = a_cast + (ip )*ps_a; \
+	} \
+\
+	/* If there is a zero region to the right of where the diagonal
+	   of C intersects the bottom of the panel, shrink it to prevent
+	   "no-op" iterations from executing. */ \
+	if ( diagoffc + m < n ) \
+	{ \
+		n = diagoffc + m; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the rectangular
+	   part of C, and the triangular portion. */ \
+	dim_t n_iter_rct; \
+	dim_t n_iter_tri; \
+\
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
+	{ \
+		/* If the entire panel of C does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the second set of
+		   loops. */ \
+		n_iter_rct = n_iter; \
+		n_iter_tri = 0; \
+	} \
+	else \
+	{ \
+		/* If the panel of C does intersect the diagonal, compute the number of
+		   iterations in the rectangular region by dividing NR into the diagonal
+		   offset. Any remainder from this integer division is discarded, which
+		   is what we want. That is, we want the rectangular region to contain
+		   as many columns of whole microtiles as possible without including any
+		   microtiles that intersect the diagonal. The number of iterations in
+		   the triangular (or trapezoidal) region is computed as the remaining
+		   number of iterations in the n dimension. */ \
+		n_iter_rct = diagoffc / NR; \
+		n_iter_tri = n_iter - n_iter_rct; \
+	} \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
+	   loops for the initial rectangular region of C (if it exists). */ \
+	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* No need to compute the diagonal offset for the rectangular
+			   region. */ \
+			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* The rectangular region contains no microtiles that
+			   intersect the diagonal, so no diagonal test is needed
+			   here. Full MR x NR microtiles are computed and stored
+			   directly into C as we normally would. Edge-case
+			   microtiles are instead computed in the temporary buffer
+			   and then scaled and added into C. Microtiles in the
+			   triangular region are handled by the second set of loops. */ \
+			{ \
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Scale the edge of C and add the result. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        beta_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+\
+	/* If there is no triangular region, then we're done. */ \
+	if ( n_iter_tri == 0 ) return; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and
+	   1st loops for the remaining triangular region of C. */ \
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Advance the start and end iteration offsets for the triangular region
+	   by the number of iterations used for the rectangular region. */ \
+	jr_start += n_iter_rct; \
+	jr_end   += n_iter_rct; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* Compute the diagonal offset for the submatrix at (i,j). */ \
+			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or below the diagonal.
+			   Otherwise, if the submatrix is strictly below the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly above the diagonal, we do nothing and
+			   continue. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Scale the edge of C and add the result. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        beta_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr )
+
--- a/frame/3/herk/bli_herk_l_ker_var2sl.c
+++ b/frame/3/herk/bli_herk_l_ker_var2sl.c
@@ -57,10 +57,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_herk_l_ker_var2
+void bli_herk_l_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -322,10 +325,8 @@ void PASTEMAC(ch,varname) \
 		n_iter_tri = n_iter - n_iter_rct; \
 	} \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop for
-	   the initial rectangular region of C (if it exists). For both the
-	   rectangular and triangular regions, use contiguous assignment for the
-	   1st loop as well. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd and 1st
+	   loops for the initial rectangular region of C (if it exists). */ \
 	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
@@ -360,11 +361,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -425,8 +426,9 @@ void PASTEMAC(ch,varname) \
 	/* If there is no triangular region, then we're done. */ \
 	if ( n_iter_tri == 0 ) return; \
 \
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd loop for the remaining triangular region of C. */ \
+	/* Use round-robin assignment of micropanels to threads in the 2nd
+	   loop and slab partitioning in the 1st loop for the remaining
+	   triangular region of C. */ \
 	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 \
 	/* Advance the start and end iteration offsets for the triangular region
@@ -464,11 +466,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -550,5 +552,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }

-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
+INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl )

--- a/frame/3/herk/bli_herk_u_ker_var2rr.c
+++ b/frame/3/herk/bli_herk_u_ker_var2rr.c
@@ -0,0 +1,557 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T herk_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffc,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+
+void bli_herk_u_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffc  = bli_obj_diag_offset( c );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach and multiply the scalars attached to A and B.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+    f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( diagoffc,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffc, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffc_ij; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           i, j, jp; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of C is entirely below the diagonal,
+	   it is not stored. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
+\
+	/* If there is a zero region to the left of where the diagonal of C
+	   intersects the top edge of the panel, adjust the pointer to C and B
+	   and treat this case as if the diagonal offset were zero.
+	   NOTE: It's possible that, after this pruning, the diagonal offset
+	   is still positive (though it is guaranteed to be less than NR). */ \
+	if ( diagoffc > 0 ) \
+	{ \
+		jp       = diagoffc / NR; \
+		j        = jp * NR; \
+		n        = n - j; \
+		diagoffc = diagoffc % NR; \
+		c_cast   = c_cast + (j  )*cs_c; \
+		b_cast   = b_cast + (jp )*ps_b; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of C intersects
+	   the right edge of the panel, shrink it to prevent "no-op" iterations
+	   from executing. */ \
+	if ( -diagoffc + n < m ) \
+	{ \
+		m = -diagoffc + n; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the triangular
+	   part of C, and the rectangular portion. */ \
+	dim_t n_iter_tri; \
+	dim_t n_iter_rct; \
+\
+	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
+	{ \
+		/* If the entire panel of C does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the first set of
+		   loops. */ \
+		n_iter_tri = 0; \
+		n_iter_rct = n_iter; \
+	} \
+	else \
+	{ \
+		/* If the panel of C does intersect the diagonal, compute the number of
+		   iterations in the triangular (or trapezoidal) region by dividing NR
+		   into the sum of the number of rows in C and the diagonal offset. A
+		   non-zero remainder means we need one additional iteration. That is,
+		   we want the triangular region to contain as few columns of whole
+		   microtiles as possible while still including all microtiles that
+		   intersect the diagonal. The number of iterations in the rectangular
+		   region is the remaining number of iterations in the n dimension. */ \
+		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
+		n_iter_rct = n_iter - n_iter_tri; \
+	} \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
+	   loops for the initial triangular region of C (if it exists). */ \
+	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* Compute the diagonal offset for the submatrix at (i,j). */ \
+			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* If the diagonal intersects the current MR x NR submatrix, we
+			   compute it in the temporary buffer and then add in the elements
+			   on or above the diagonal.
+			   Otherwise, if the submatrix is strictly above the diagonal,
+			   we compute and store as we normally would.
+			   And if we're strictly below the diagonal, we do nothing and
+			   continue. */ \
+			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale C and add the result to only the stored part. */ \
+				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
+				                          m_cur, n_cur, \
+				                          ct,  rs_ct, cs_ct, \
+				                          beta_cast, \
+				                          c11, rs_c,  cs_c ); \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
+			{ \
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Scale the edge of C and add the result. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        beta_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+\
+	/* If there is no rectangular region, then we're done. */ \
+	if ( n_iter_rct == 0 ) return; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
+	   loops for the remaining rectangular region of C. */ \
+	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Advance the start and end iteration offsets for the rectangular region
+	   by the number of iterations used for the triangular region. */ \
+	jr_start += n_iter_tri; \
+	jr_end   += n_iter_tri; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Interior loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			/* No need to compute the diagonal offset for the rectangular
+			   region. */ \
+			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* NOTE: Unlike the loop over the triangular region above, no
+			   diagonal test is performed here: every MR x NR submatrix
+			   visited by this loop is computed and stored in full.
+			   Interior submatrices are written by the microkernel directly
+			   into C, while edge-case submatrices are computed in the
+			   temporary buffer and then scaled-and-added into C by the
+			   xpbys_mxn call below. */ \
+			{ \
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Scale the edge of C and add the result. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        beta_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr )
+
--- a/frame/3/herk/bli_herk_u_ker_var2sl.c
+++ b/frame/3/herk/bli_herk_u_ker_var2sl.c
@@ -57,10 +57,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_herk_u_ker_var2
+void bli_herk_u_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -118,7 +121,7 @@ void bli_herk_u_ker_var2

 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_exec];
+    f = ftypes[dt_exec];

 	// Invoke the function.
 	f( diagoffc,
@@ -324,10 +327,9 @@ void PASTEMAC(ch,varname) \
 		n_iter_rct = n_iter - n_iter_tri; \
 	} \
 \
-	/* Use interleaved (round robin) assignment of micropanels to threads in the
-	   2nd loop for the initial triangular region of C (if it exists). For both
-	   the rectangular and triangular regions, use contiguous assignment for the
-	   1st loop. */ \
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   and slab partitioning in the 1st loop for the initial triangular region
+	   of C (if it exists). */ \
 	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
@@ -361,11 +363,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -449,8 +451,8 @@ void PASTEMAC(ch,varname) \
 	/* If there is no rectangular region, then we're done. */ \
 	if ( n_iter_rct == 0 ) return; \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop for
-	   the remaining triangular region of C. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd and 1st loops
+	   for the remaining rectangular region of C. */ \
 	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 \
 	/* Advance the start and end iteration offsets for the rectangular region
@@ -489,11 +491,11 @@ void PASTEMAC(ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -552,5 +554,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }

-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
+INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl )

--- a/frame/3/herk/bli_herk_var.h
+++ b/frame/3/herk/bli_herk_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -55,9 +56,13 @@ void PASTEMAC0(opname) \
 //GENPROT( herk_blk_var2 )
 //GENPROT( herk_blk_var3 )

-GENPROT( herk_x_ker_var2 )
-GENPROT( herk_l_ker_var2 )
-GENPROT( herk_u_ker_var2 )
+GENPROT( herk_x_ker_var2sl )
+GENPROT( herk_x_ker_var2rr )
+
+GENPROT( herk_l_ker_var2sl )
+GENPROT( herk_l_ker_var2rr )
+GENPROT( herk_u_ker_var2sl )
+GENPROT( herk_u_ker_var2rr )
 //GENPROT( herk_packa )
 //GENPROT( herk_packb )

@@ -89,6 +94,8 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
-INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )
+INSERT_GENTPROT_BASIC0( herk_l_ker_var2sl )
+INSERT_GENTPROT_BASIC0( herk_l_ker_var2rr )
+INSERT_GENTPROT_BASIC0( herk_u_ker_var2sl )
+INSERT_GENTPROT_BASIC0( herk_u_ker_var2rr )

--- a/frame/3/herk/bli_herk_x_ker_var2.c
+++ b/frame/3/herk/bli_herk_x_ker_var2.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -34,12 +35,12 @@

 #include "blis.h"

-static gemm_var_oft vars[2] =
+static gemm_var_oft vars_sl[2] =
 {
-	bli_herk_l_ker_var2, bli_herk_u_ker_var2,
+	bli_herk_l_ker_var2sl, bli_herk_u_ker_var2sl,
 };

-void bli_herk_x_ker_var2
+void bli_herk_x_ker_var2sl
     (
       obj_t*  a,
       obj_t*  ah,
@@ -58,7 +59,48 @@ void bli_herk_x_ker_var2
 	else                              uplo = 1;

 	// Index into the variant array to extract the correct function pointer.
-	f = vars[uplo];
+	f = vars_sl[uplo];
+
+	// Call the macrokernel.
+	f
+	(
+	  a,
+	  ah,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl,
+	  thread
+	);
+}
+
+// -----------------------------------------------------------------------------
+
+static gemm_var_oft vars_rr[2] =
+{
+	bli_herk_l_ker_var2rr, bli_herk_u_ker_var2rr,
+};
+
+void bli_herk_x_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  ah,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	bool_t       uplo;
+	gemm_var_oft f;
+
+	// Set a bool based on the uplo field of C's root object.
+	if ( bli_obj_root_is_lower( c ) ) uplo = 0;
+	else                              uplo = 1;
+
+	// Index into the variant array to extract the correct function pointer.
+	f = vars_rr[uplo];

 	// Call the macrokernel.
 	f
--- a/frame/3/trmm/bli_trmm_ll_ker_var2rr.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2rr.c
@@ -0,0 +1,535 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+// Signature shared by the datatype-specific macrokernels stored in ftypes[].
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffa,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+// Table of typed macrokernel function pointers; indexed below as ftypes[dt_exec].
+static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+// Object-based front end: unpacks a, b, and c and calls the typed macrokernel for c's execution datatype.
+void bli_trmm_ll_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+	// m and n are the length and width of c; k is the width of a.
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+	// Local objects that receive the scalars detached from a and b below.
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+	// Typed macrokernel, selected from ftypes[] by dt_exec below.
+	FUNCPTR_T f;
+
+	// Detach and multiply the scalars attached to A and B.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( diagoffa,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, pd_a, ps_a,
+	   buf_b, rs_b, pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Typed macrokernel: jr loop (round-robin over threads) around a serial ir loop that calls gemm_ukr. */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffa, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	const dim_t     PACKMR     = cs_a; \
+	const dim_t     PACKNR     = rs_b; \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffa_i; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_a1011; \
+	dim_t           off_a1011; \
+	dim_t           i, j; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_a_num; \
+	inc_t           ss_a_den; \
+	inc_t           ps_a_cur; \
+	inc_t           is_a_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if both MR and NR are odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current block of A is entirely above the diagonal,
+	   it is implicitly zero. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
+\
+	/* Compute k_full. For all trmm, k_full is simply k. This is
+	   needed because some parameter combinations of trmm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of B (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = k; \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_a ) || \
+	     bli_is_3mi_packed( schema_a ) || \
+	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. And if we are packing real-only, imag-only, or
+	   summed-only, we need to scale the computed panel sizes by 1/2
+	   to compensate for the fact that the pointer arithmetic occurs
+	   in terms of complex elements rather than real elements. */ \
+	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
+	else                                      { ss_a_num = 1; ss_a_den = 1; } \
+\
+	/* If there is a zero region above where the diagonal of A intersects the
+	   left edge of the block, adjust the pointer to C and treat this case as
+	   if the diagonal offset were zero. This skips over the region that was
+	   not packed. (Note we assume the diagonal offset is a multiple of MR;
+	   this assumption will hold as long as the cache blocksizes are each a
+	   multiple of MR and NR.) */ \
+	if ( diagoffa < 0 ) \
+	{ \
+		i        = -diagoffa; \
+		m        = m - i; \
+		diagoffa = 0; \
+		c_cast   = c_cast + (i  )*rs_c; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k; \
+	istep_b = PACKNR * k_full; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. The query of the thrinfo_t node for the
+	   1st (ir) loop is left commented out (ir parallelism is disabled). */ \
+	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+\
+	dim_t jr_start, jr_end; \
+	/*dim_t ir_start, ir_end;*/ \
+	dim_t jr_inc; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd (jr)
+	   loop; the range covers all n_iter iterations of the n dimension.
+	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		a1  = a_cast; \
+		c11 = c1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = 0; i < m_iter; ++i ) \
+		{ \
+			diagoffa_i = diagoffa + ( doff_t )i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* If the current panel of A intersects the diagonal, scale C
+			   by beta. If it is strictly below the diagonal, scale by one.
+			   This allows the current macro-kernel to work for both trmm
+			   and trmm3. */ \
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict b1_i; \
+				ctype* restrict a2; \
+\
+				/* Determine the offset to and length of the panel that was
+				   packed so we can index into the corresponding location in
+				   b1. */ \
+				off_a1011 = 0; \
+				k_a1011   = bli_min( diagoffa_i + MR, k ); \
+\
+				/* Compute the panel stride for the current diagonal-
+				   intersecting micro-panel. */ \
+				is_a_cur  = k_a1011 * PACKMR; \
+				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
+				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
+\
+				/* NOTE: ir loop parallelism disabled for now. */ \
+				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
+\
+				b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_a1011, \
+					  alpha_cast, \
+					  a1, \
+					  b1_i, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Copy edge elements of C to the temporary buffer. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        c11, rs_c,  cs_c, \
+					                        ct,  rs_ct, cs_ct ); \
+\
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_a1011, \
+					  alpha_cast, \
+					  a1, \
+					  b1_i, \
+					  beta_cast, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+				/*}*/ \
+\
+				a1 += ps_a_cur; \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				/* NOTE: ir loop parallelism disabled for now. */ \
+				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
+\
+				ctype* restrict a2; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  one, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
+					                       ct,  rs_ct, cs_ct, \
+					                       c11, rs_c,  cs_c ); \
+				} \
+				/*}*/ \
+\
+				a1 += rstep_a; \
+			} \
+\
+			c11 += rstep_c; \
+		} \
+	} \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr )
+
--- a/frame/3/trmm/bli_trmm_ll_ker_var2sl.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trmm_ll_ker_var2
+void bli_trmm_ll_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -333,10 +336,9 @@ void PASTEMAC(ch,varname) \
 	/*dim_t ir_start, ir_end;*/ \
 	dim_t jr_inc; \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop for
-	   the initial rectangular region of C (if it exists). For both the
-	   rectangular and triangular regions, use contiguous assignment for the
-	   1st loop as well. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd loop for
+	   the initial rectangular region of C (if it exists).
+	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
 	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
 \
@@ -393,11 +395,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -457,17 +459,18 @@ void PASTEMAC(ch,varname) \
 			} \
 			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
 			{ \
+				/* NOTE: ir loop parallelism disabled for now. */ \
 				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
 \
 				ctype* restrict a2; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -524,9 +527,9 @@ void PASTEMAC(ch,varname) \
 			c11 += rstep_c; \
 		} \
 	} \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

-INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl )

--- a/frame/3/trmm/bli_trmm_lu_ker_var2rr.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2rr.c
@@ -0,0 +1,542 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffa,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+// Object-based front-end: unpacks a, b, and c, then calls through ftypes[].
+void bli_trmm_lu_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach the scalars attached to A and B and multiply them together.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers: alpha is the scalar
+	// merged above (read from scalar_b); beta is the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the function pointer array using the execution datatype
+	// of C (dt_exec) to extract the correct function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function. Note that cntl is not forwarded.
+	f( diagoffa,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, pd_a, ps_a,
+	   buf_b, rs_b, pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffa, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	const dim_t     PACKMR     = cs_a; \
+	const dim_t     PACKNR     = rs_b; \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffa_i; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_a1112; \
+	dim_t           off_a1112; \
+	dim_t           i, j; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_a_num; \
+	inc_t           ss_a_den; \
+	inc_t           ps_a_cur; \
+	inc_t           is_a_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if PACKMR and NR, or PACKNR and MR, are both odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current block of A is entirely below the diagonal,
+	   it is implicitly zero. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
+\
+	/* Compute k_full. For all trmm, k_full is simply k. This is
+	   needed because some parameter combinations of trmm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of B (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = k; \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_a ) || \
+	     bli_is_3mi_packed( schema_a ) || \
+	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. And if we are packing real-only, imag-only, or
+	   summed-only, we need to scale the computed panel sizes by 1/2
+	   to compensate for the fact that the pointer arithmetic occurs
+	   in terms of complex elements rather than real elements. */ \
+	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
+	else                                      { ss_a_num = 1; ss_a_den = 1; } \
+\
+	/* If there is a zero region to the left of where the diagonal of A
+	   intersects the top edge of the block, adjust the pointer to B and
+	   treat this case as if the diagonal offset were zero. Note that we
+	   don't need to adjust the pointer to A since packm would have simply
+	   skipped over the region that was not stored. */ \
+	if ( diagoffa > 0 ) \
+	{ \
+		i        = diagoffa; \
+		k        = k - i; \
+		diagoffa = 0; \
+		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of A intersects the
+	   right side of the block, shrink it to prevent "no-op" iterations from
+	   executing. */ \
+	if ( -diagoffa + k < m ) \
+	{ \
+		m = -diagoffa + k; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k; \
+	istep_b = PACKNR * k_full; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
+	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
+\
+	dim_t jr_start, jr_end; \
+	/*dim_t ir_start, ir_end;*/ \
+	dim_t jr_inc; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop for
+	   the initial rectangular region of C (if it exists). */ \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		a1  = a_cast; \
+		c11 = c1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = 0; i < m_iter; ++i ) \
+		{ \
+			diagoffa_i = diagoffa + ( doff_t )i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* If the current panel of A intersects the diagonal, scale C
+			   by beta. If it is strictly above the diagonal, scale by one.
+			   This allows the current macro-kernel to work for both trmm
+			   and trmm3. */ \
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict b1_i; \
+				ctype* restrict a2; \
+\
+				/* Determine the offset to and length of the panel that was
+				   packed so we can index into the corresponding location in
+				   b1. */ \
+				off_a1112 = diagoffa_i; \
+				k_a1112   = k - off_a1112; \
+\
+				/* Compute the panel stride for the current diagonal-
+				   intersecting micro-panel. */ \
+				is_a_cur  = k_a1112 * PACKMR; \
+				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
+				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
+\
+				/* NOTE: ir loop parallelism disabled for now. */ \
+				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
+\
+				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_a1112, \
+					  alpha_cast, \
+					  a1, \
+					  b1_i, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Copy edge elements of C to the temporary buffer. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        c11, rs_c,  cs_c, \
+					                        ct,  rs_ct, cs_ct ); \
+\
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_a1112, \
+					  alpha_cast, \
+					  a1, \
+					  b1_i, \
+					  beta_cast, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+				/*}*/ \
+\
+				a1 += ps_a_cur; \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				/* NOTE: ir loop parallelism disabled for now. */ \
+				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
+\
+				ctype* restrict a2; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  one, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
+					                       ct,  rs_ct, cs_ct, \
+					                       c11, rs_c,  cs_c ); \
+				} \
+				/*}*/ \
+\
+				a1 += rstep_a; \
+			} \
+\
+			c11 += rstep_c; \
+		} \
+	} \
+\
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr )
+
--- a/frame/3/trmm/bli_trmm_lu_ker_var2sl.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trmm_lu_ker_var2
+void bli_trmm_lu_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -340,10 +343,8 @@ void PASTEMAC(ch,varname) \
 	/*dim_t ir_start, ir_end;*/ \
 	dim_t jr_inc; \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop for
-	   the initial rectangular region of C (if it exists). For both the
-	   rectangular and triangular regions, use contiguous assignment for the
-	   1st loop as well. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd loop for
+	   the initial rectangular region of C (if it exists). */ \
 	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
 \
@@ -393,17 +394,18 @@ void PASTEMAC(ch,varname) \
 				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
 				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
 \
+				/* NOTE: ir loop parallelism disabled for now. */ \
 				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
 \
 				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -463,17 +465,18 @@ void PASTEMAC(ch,varname) \
 			} \
 			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
 			{ \
+				/* NOTE: ir loop parallelism disabled for now. */ \
 				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
 \
 				ctype* restrict a2; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -531,9 +534,9 @@ void PASTEMAC(ch,varname) \
 		} \
 	} \
 \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

-INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl )

--- a/frame/3/trmm/bli_trmm_rl_ker_var2rr.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2rr.c
@@ -0,0 +1,598 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffb,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+// Object-based front-end: unpacks a, b, and c, then calls through ftypes[].
+void bli_trmm_rl_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffb  = bli_obj_diag_offset( b );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach the scalars attached to A and B and multiply them together.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers: alpha is the scalar
+	// merged above (read from scalar_b); beta is the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the function pointer array using the execution datatype
+	// of C (dt_exec) to extract the correct function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function. Note that cntl is not forwarded.
+	f( diagoffb,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, pd_a, ps_a,
+	   buf_b, rs_b, pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffb, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	const dim_t     PACKMR     = cs_a; \
+	const dim_t     PACKNR     = rs_b; \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffb_j; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_b1121; \
+	dim_t           off_b1121; \
+	dim_t           i, j; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_b_num; \
+	inc_t           ss_b_den; \
+	inc_t           ps_b_cur; \
+	inc_t           is_b_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if PACKMR and NR, or PACKNR and MR, are both odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of B is entirely above the diagonal,
+	   it is implicitly zero. So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
+\
+	/* Compute k_full. For all trmm, k_full is simply k. This is
+	   needed because some parameter combinations of trmm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of A (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = k; \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_b ) || \
+	     bli_is_3mi_packed( schema_b ) || \
+	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. And if we are packing real-only, imag-only, or
+	   summed-only, we need to scale the computed panel sizes by 1/2
+	   to compensate for the fact that the pointer arithmetic occurs
+	   in terms of complex elements rather than real elements. */ \
+	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
+	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
+	else                                      { ss_b_num = 1; ss_b_den = 1; } \
+\
+	/* If there is a zero region above where the diagonal of B intersects
+	   the left edge of the panel, adjust the pointer to A and treat this
+	   case as if the diagonal offset were zero. Note that we don't need to
+	   adjust the pointer to B since packm would have simply skipped over
+	   the region that was not stored. */ \
+	if ( diagoffb < 0 ) \
+	{ \
+		j        = -diagoffb; \
+		k        = k - j; \
+		diagoffb = 0; \
+		a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
+	} \
+\
+	/* If there is a zero region to the right of where the diagonal
+	   of B intersects the bottom of the panel, shrink it to prevent
+	   "no-op" iterations from executing. */ \
+	if ( diagoffb + k < n ) \
+	{ \
+		n = diagoffb + k; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k_full; \
+	istep_b = PACKNR * k; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the rectangular
+	   part of B, and the triangular portion. */ \
+	dim_t n_iter_rct; \
+	dim_t n_iter_tri; \
+\
+	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
+	{ \
+		/* If the entire panel of B does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the second set of
+		   loops. */ \
+		n_iter_rct = n_iter; \
+		n_iter_tri = 0; \
+	} \
+	else \
+	{ \
+		/* If the panel of B does intersect the diagonal, compute the number of
+		   iterations in the rectangular region by dividing NR into the diagonal
+		   offset. (There should never be any remainder in this division.) The
+		   number of iterations in the triangular (or trapezoidal) region is
+		   computed as the remaining number of iterations in the n dimension. */ \
+		n_iter_rct = diagoffb / NR; \
+		n_iter_tri = n_iter - n_iter_rct; \
+	} \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
+	   loops for the initial rectangular region of B (if it exists). */ \
+	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		{ \
+			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+			/* Loop over the m dimension (MR rows at a time). */ \
+			for ( i = ir_start; i < ir_end; i += ir_inc ) \
+			{ \
+				ctype* restrict a2; \
+\
+				a1  = a_cast + i * rstep_a; \
+				c11 = c1     + i * rstep_c; \
+\
+				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  one, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
+					                       ct,  rs_ct, cs_ct, \
+					                       c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+\
+	/* If there is no triangular region, then we're done. */ \
+	if ( n_iter_tri == 0 ) return; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   for the remaining triangular region of B (if it exists).
+	   NOTE: We don't need to call bli_thread_range_jrir*() here since we
+	   employ a hack that calls for each thread to execute every iteration
+	   of the jr and ir loops but skip all but the pointer increment for
+	   iterations that are not assigned to it. */ \
+\
+	/* Advance the starting b1 and c1 pointers to the positions corresponding
+	   to the start of the triangular region of B. */ \
+	jr_start = n_iter_rct; \
+	b1 = b_cast + jr_start * cstep_b; \
+	c1 = c_cast + jr_start * cstep_c; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < n_iter; ++j ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		diagoffb_j = diagoffb - ( doff_t )j*NR; \
+\
+		/* Determine the offset to the beginning of the panel that
+		   was packed so we can index into the corresponding location
+		   in A. Then compute the length of that panel. */ \
+		off_b1121 = bli_max( -diagoffb_j, 0 ); \
+		k_b1121   = k - off_b1121; \
+\
+		a1  = a_cast; \
+		c11 = c1; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* If the current panel of B intersects the diagonal, scale C
+		   by beta. If it is strictly below the diagonal, scale by one.
+		   This allows the current macro-kernel to work for both trmm
+		   and trmm3. */ \
+		{ \
+			/* Compute the panel stride for the current diagonal-
+			   intersecting micro-panel. */ \
+			is_b_cur  = k_b1121 * PACKNR; \
+			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
+			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
+\
+			if ( bli_trmm_my_iter( j, thread ) ) { \
+\
+			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
+\
+			/* Loop over the m dimension (MR rows at a time). */ \
+			for ( i = 0; i < m_iter; ++i ) \
+			{ \
+				if ( bli_trmm_my_iter( i, caucus ) ) { \
+\
+				ctype* restrict a1_i; \
+				ctype* restrict a2; \
+\
+				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_b1121, \
+					  alpha_cast, \
+					  a1_i, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Copy edge elements of C to the temporary buffer. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        c11, rs_c,  cs_c, \
+					                        ct,  rs_ct, cs_ct ); \
+\
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_b1121, \
+					  alpha_cast, \
+					  a1_i, \
+					  b1, \
+					  beta_cast, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+				} \
+\
+				a1  += rstep_a; \
+				c11 += rstep_c; \
+			} \
+			} \
+\
+			b1 += ps_b_cur; \
+		} \
+\
+		c1 += cstep_c; \
+	} \
+\
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr )
+
--- a/frame/3/trmm/bli_trmm_rl_ker_var2sl.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trmm_rl_ker_var2
+void bli_trmm_rl_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -360,10 +363,8 @@ void PASTEMAC(ch,varname) \
 		n_iter_tri = n_iter - n_iter_rct; \
 	} \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop for
-	   the initial rectangular region of B (if it exists). For both the
-	   rectangular and triangular regions, use contiguous assignment for the
-	   1st loop as well. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd and 1st
+	   loops for the initial rectangular region of B (if it exists). */ \
 	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
@@ -399,11 +400,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -455,12 +456,12 @@ void PASTEMAC(ch,varname) \
 	/* If there is no triangular region, then we're done. */ \
 	if ( n_iter_tri == 0 ) return; \
 \
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-       the 2nd loop for the remaining triangular region of B (if it exists).
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   for the remaining triangular region of B (if it exists).
 	   NOTE: We don't need to call bli_thread_range_jrir*() here since we
 	   employ a hack that calls for each thread to execute every iteration
-	   of the jr loop but skip all but the pointer increment for iterations
-	   that are not assigned to it. */ \
+	   of the jr and ir loops but skip all but the pointer increment for
+	   iterations that are not assigned to it. */ \
 \
 	/* Advance the starting b1 and c1 pointers to the positions corresponding
 	   to the start of the triangular region of B. */ \
@@ -522,11 +523,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -589,9 +590,9 @@ void PASTEMAC(ch,varname) \
 		c1 += cstep_c; \
 	} \
 \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

-INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl )

--- a/frame/3/trmm/bli_trmm_ru_ker_var2rr.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2rr.c
@@ -0,0 +1,618 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+// Pointer type for the typed macrokernels that GENTFUNC generates below.
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffb,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+// Table of typed macrokernels; the wrapper indexes it by execution datatype.
+static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+
+void bli_trmm_ru_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	// Datatype in which the operation executes; it selects the typed kernel.
+	const num_t  exec_dt = bli_obj_exec_dt( c );
+
+	// Diagonal offset of B and the pack schemas of both packed operands.
+	const doff_t doff_b  = bli_obj_diag_offset( b );
+	const pack_t pack_a  = bli_obj_pack_schema( a );
+	const pack_t pack_b  = bli_obj_pack_schema( b );
+
+	// Problem dimensions: m and n come from C, k is the width of A.
+	const dim_t  dim_m   = bli_obj_length( c );
+	const dim_t  dim_n   = bli_obj_width( c );
+	const dim_t  dim_k   = bli_obj_width( a );
+
+	// Buffer, column stride, panel dimension and panel stride of A.
+	void* const  a_buf   = bli_obj_buffer_at_off( a );
+	const inc_t  a_cs    = bli_obj_col_stride( a );
+	const dim_t  a_pd    = bli_obj_panel_dim( a );
+	const inc_t  a_ps    = bli_obj_panel_stride( a );
+
+	// Buffer, row stride, panel dimension and panel stride of B.
+	void* const  b_buf   = bli_obj_buffer_at_off( b );
+	const inc_t  b_rs    = bli_obj_row_stride( b );
+	const dim_t  b_pd    = bli_obj_panel_dim( b );
+	const inc_t  b_ps    = bli_obj_panel_stride( b );
+
+	// Buffer and both strides of C.
+	void* const  c_buf   = bli_obj_buffer_at_off( c );
+	const inc_t  c_rs    = bli_obj_row_stride( c );
+	const inc_t  c_cs    = bli_obj_col_stride( c );
+
+	// Objects that receive the scalars detached from A and B.
+	obj_t        scal_a;
+	obj_t        scal_b;
+
+	// Detach the scalars attached to A and B, then multiply them so that
+	// a single merged alpha (held in scal_b) reaches the kernel.
+	bli_obj_scalar_detach( a, &scal_a );
+	bli_obj_scalar_detach( b, &scal_b );
+	bli_mulsc( &scal_a, &scal_b );
+
+	// alpha is the merged scalar (in scal_b); beta is attached to C.
+	void* const  alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
+	void* const  beta_buf  = bli_obj_internal_scalar_buffer( c );
+
+	// Look up the macrokernel for the execution datatype and invoke it.
+	ftypes[ exec_dt ]
+	(
+	  doff_b,
+	  pack_a,
+	  pack_b,
+	  dim_m,
+	  dim_n,
+	  dim_k,
+	  alpha_buf,
+	  a_buf, a_cs, a_pd, a_ps,
+	  b_buf, b_rs, b_pd, b_ps,
+	  beta_buf,
+	  c_buf, c_rs, c_cs,
+	  cntx,
+	  rntm,
+	  thread
+	);
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+/* Typed trmm macrokernel: B is the triangular operand; jr/ir use round-robin. */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffb, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	const dim_t     PACKMR     = cs_a; \
+	const dim_t     PACKNR     = rs_b; \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffb_j; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_b0111; \
+	dim_t           off_b0111; \
+	dim_t           i, j, jb0; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_b_num; \
+	inc_t           ss_b_den; \
+	inc_t           ps_b_cur; \
+	inc_t           is_b_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if both MR and NR are odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If the current panel of B is entirely below its diagonal,
+	   it is implicitly zero. So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
+\
+	/* Compute k_full. For all trmm, k_full is simply k. This is
+	   needed because some parameter combinations of trmm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of A (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = k; \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_b ) || \
+	     bli_is_3mi_packed( schema_b ) || \
+	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. And if we are packing real-only, imag-only, or
+	   summed-only, we need to scale the computed panel sizes by 1/2
+	   to compensate for the fact that the pointer arithmetic occurs
+	   in terms of complex elements rather than real elements. */ \
+	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
+	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
+	else                                      { ss_b_num = 1; ss_b_den = 1; } \
+\
+	/* If there is a zero region to the left of where the diagonal of B
+	   intersects the top edge of the panel, adjust the pointer to C and
+	   treat this case as if the diagonal offset were zero. This skips over
+	   the region that was not packed. (Note we assume the diagonal offset
+	   is a multiple of MR; this assumption will hold as long as the cache
+	   blocksizes are each a multiple of MR and NR.) */ \
+	if ( diagoffb > 0 ) \
+	{ \
+		j        = diagoffb; \
+		n        = n - j; \
+		diagoffb = 0; \
+		c_cast   = c_cast + (j  )*cs_c; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of B intersects the
+	   right side of the block, shrink it to prevent "no-op" iterations from
+	   executing. */ \
+	if ( -diagoffb + n < k ) \
+	{ \
+		k = -diagoffb + n; \
+	} \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k_full; \
+	istep_b = PACKNR * k; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Note that we partition the 2nd loop into two regions: the triangular
+	   part of B, and the rectangular portion. */ \
+	dim_t n_iter_tri; \
+	dim_t n_iter_rct; \
+\
+	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
+	{ \
+		/* If the entire panel of B does not intersect the diagonal, there is
+		   no triangular region, and therefore we can skip the first set of
+		   loops. */ \
+		n_iter_tri = 0; \
+		n_iter_rct = n_iter; \
+	} \
+	else \
+	{ \
+		/* If the panel of B does intersect the diagonal, compute the number of
+		   iterations in the triangular (or trapezoidal) region by dividing NR
+		   into k + diagoffb, rounding up so that any remainder gets its own
+		   iteration. The number of iterations in the rectangular region is
+		   computed as the remaining number of iterations in the n dimension. */ \
+		n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
+		n_iter_rct = n_iter - n_iter_tri; \
+	} \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   for the initial triangular region of B (if it exists).
+	   NOTE: We don't need to call bli_thread_range_jrir*() here since we
+	   employ a hack that calls for each thread to execute every iteration
+	   of the jr and ir loops but skip all but the pointer increment for
+	   iterations that are not assigned to it. */ \
+\
+	b1 = b_cast; \
+	c1 = c_cast; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = 0; j < n_iter_tri; ++j ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		diagoffb_j = diagoffb - ( doff_t )j*NR; \
+\
+		/* Determine the offset to and length of the panel that was packed
+		   so we can index into the corresponding location in A. */ \
+		off_b0111 = 0; \
+		k_b0111   = bli_min( k, -diagoffb_j + NR ); \
+\
+		a1  = a_cast; \
+		c11 = c1; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* The panels of B handled by this loop are the diagonal-intersecting
+		   ones, so the micro-kernel is given beta as the scalar for C. (The
+		   rectangular region handled afterwards accumulates into C using
+		   one instead.) */ \
+		{ \
+			/* Compute the panel stride for the current diagonal-
+			   intersecting micro-panel. */ \
+			is_b_cur  = k_b0111 * PACKNR; \
+			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
+			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
+\
+			if ( bli_trmm_my_iter( j, thread ) ) { \
+\
+			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
+\
+			/* Loop over the m dimension (MR rows at a time). */ \
+			for ( i = 0; i < m_iter; ++i ) \
+			{ \
+				if ( bli_trmm_my_iter( i, caucus ) ) { \
+\
+				ctype* restrict a1_i; \
+				ctype* restrict a2; \
+\
+				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+				a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_b0111, \
+					  alpha_cast, \
+					  a1_i, \
+					  b1, \
+					  beta_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Copy edge elements of C to the temporary buffer. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        c11, rs_c,  cs_c, \
+					                        ct,  rs_ct, cs_ct ); \
+\
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k_b0111, \
+					  alpha_cast, \
+					  a1_i, \
+					  b1, \
+					  beta_cast, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+				} \
+\
+				a1  += rstep_a; \
+				c11 += rstep_c; \
+			} \
+			} \
+\
+			b1 += ps_b_cur; \
+		} \
+\
+		c1 += cstep_c; \
+	} \
+\
+	/* If there is no rectangular region, then we're done. */ \
+	if ( n_iter_rct == 0 ) return; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
+	   loops for the remaining rectangular region of B. */ \
+	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Advance the start and end iteration offsets for the rectangular region
+	   by the number of iterations used for the triangular region. */ \
+    jr_start += n_iter_tri; \
+    jr_end   += n_iter_tri; \
+	jb0       = n_iter_tri; \
+\
+	/* Save the resulting value of b1 from the previous loop since it represents
+	   the starting point for the rectangular region. */ \
+	b_cast = b1; \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		/* NOTE: We must index through b_cast differently since it contains
+		   the starting address of the rectangular region (which is already
+		   n_iter_tri logical iterations through B). */ \
+		b1 = b_cast + (j-jb0) * cstep_b; \
+        c1 = c_cast +  j      * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* The panels of B in this rectangular region do not intersect the
+		   diagonal, so the product is accumulated into C: the micro-kernel
+		   is called with one (or with zero into ct followed by adds_mxn
+		   for edge cases) rather than beta. */ \
+		{ \
+			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+			/* Loop over the m dimension (MR rows at a time). */ \
+			for ( i = ir_start; i < ir_end; i += ir_inc ) \
+			{ \
+				ctype* restrict a2; \
+\
+				a1  = a_cast + i * rstep_a; \
+				c11 = c1     + i * rstep_c; \
+\
+				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+				if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  one, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  alpha_cast, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
+					                       ct,  rs_ct, cs_ct, \
+					                       c11, rs_c,  cs_c ); \
+				} \
+			} \
+		} \
+	} \
+\
+\
+\
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+}
+// Instantiate the macrokernel above (presumably once per basic datatype).
+INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr )
+
--- a/frame/3/trmm/bli_trmm_ru_ker_var2sl.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trmm_ru_ker_var2
+void bli_trmm_ru_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -365,12 +368,12 @@ void PASTEMAC(ch,varname) \
 		n_iter_rct = n_iter - n_iter_tri; \
 	} \
 \
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd loop for the initial triangular region of B (if it exists).
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop
+	   for the initial triangular region of B (if it exists).
 	   NOTE: We don't need to call bli_thread_range_jrir*() here since we
 	   employ a hack that calls for each thread to execute every iteration
-	   of the jr loop but skip all but the pointer increment for iterations
-	   that are not assigned to it. */ \
+	   of the jr and ir loops but skip all but the pointer increment for
+	   iterations that are not assigned to it. */ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
@@ -428,11 +431,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -498,8 +501,8 @@ void PASTEMAC(ch,varname) \
 	/* If there is no rectangular region, then we're done. */ \
 	if ( n_iter_rct == 0 ) return; \
 \
-	/* Use contiguous assignment of micropanels to threads in both the 2nd and
-	   1st loops the remaining triangular region of B. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd and 1st
+	   loops for the remaining rectangular region of B. */ \
 	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
@@ -552,11 +555,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
+				if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -607,9 +610,9 @@ void PASTEMAC(ch,varname) \
 \
 \
 \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

-INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl )

--- a/frame/3/trmm/bli_trmm_var.h
+++ b/frame/3/trmm/bli_trmm_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -55,11 +56,17 @@ void PASTEMAC0(opname) \
 //GENPROT( trmm_blk_var2 )
 //GENPROT( trmm_blk_var3 )

-GENPROT( trmm_xx_ker_var2 )
-GENPROT( trmm_ll_ker_var2 )
-GENPROT( trmm_lu_ker_var2 )
-GENPROT( trmm_rl_ker_var2 )
-GENPROT( trmm_ru_ker_var2 )
+GENPROT( trmm_xx_ker_var2sl )
+GENPROT( trmm_xx_ker_var2rr )
+
+GENPROT( trmm_ll_ker_var2sl )
+GENPROT( trmm_ll_ker_var2rr )
+GENPROT( trmm_lu_ker_var2sl )
+GENPROT( trmm_lu_ker_var2rr )
+GENPROT( trmm_rl_ker_var2sl )
+GENPROT( trmm_rl_ker_var2rr )
+GENPROT( trmm_ru_ker_var2sl )
+GENPROT( trmm_ru_ker_var2rr )


 //
@@ -89,8 +96,12 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
-INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )
+INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2rr )
+INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2rr )
+INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2rr )
+INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2rr )

--- a/frame/3/trmm/bli_trmm_xx_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -34,13 +35,13 @@

 #include "blis.h"

-static gemm_var_oft vars[2][2] =
+static gemm_var_oft vars_sl[2][2] =
 {
-	{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
-	{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
+	{ bli_trmm_ll_ker_var2sl, bli_trmm_lu_ker_var2sl },
+	{ bli_trmm_rl_ker_var2sl, bli_trmm_ru_ker_var2sl }
 };

-void bli_trmm_xx_ker_var2
+void bli_trmm_xx_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -72,7 +73,62 @@ void bli_trmm_xx_ker_var2
 	}

 	// Index into the variant array to extract the correct function pointer.
-	f = vars[side][uplo];
+	f = vars_sl[side][uplo];
+
+	// Call the macrokernel.
+	f
+	(
+	  a,
+	  b,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl,
+	  thread
+	);
+}
+
+// -----------------------------------------------------------------------------
+
+static gemm_var_oft vars_rr[2][2] =
+{
+	{ bli_trmm_ll_ker_var2rr, bli_trmm_lu_ker_var2rr },
+	{ bli_trmm_rl_ker_var2rr, bli_trmm_ru_ker_var2rr }
+};
+
+void bli_trmm_xx_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	bool_t       side;
+	bool_t       uplo;
+	gemm_var_oft f;
+
+	// Set two bools: one based on the implied side parameter (the structure
+	// of the root object) and one based on the uplo field of the triangular
+	// matrix's root object (whether that is matrix A or matrix B).
+	if ( bli_obj_root_is_triangular( a ) )
+	{
+		side = 0;
+		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+	else // if ( bli_obj_root_is_triangular( b ) )
+	{
+		side = 1;
+		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+
+	// Index into the variant array to extract the correct function pointer.
+	f = vars_rr[side][uplo];

 	// Call the macrokernel.
 	f
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -53,7 +54,28 @@ cntl_t* bli_trsm_l_cntl_create
       pack_t schema_b
     )
 {
-	void* macro_kernel_p = bli_trsm_xx_ker_var2;
+	void* macro_kernel_p;
+	void* packa_fp;
+	void* packb_fp;
+
+#ifdef BLIS_ENABLE_JRIR_SLAB
+
+	// Use the function pointer to the macrokernels that use slab
+	// assignment of micropanels to threads in the jr and ir loops.
+	macro_kernel_p = bli_trsm_xx_ker_var2sl;
+
+	packa_fp = bli_packm_blk_var1sl;
+	packb_fp = bli_packm_blk_var1sl;
+
+#else // BLIS_ENABLE_JRIR_RR
+
+	// Use the function pointer to the macrokernels that use round-robin
+	// assignment of micropanels to threads in the jr and ir loops.
+	macro_kernel_p = bli_trsm_xx_ker_var2rr;
+
+	packa_fp = bli_packm_blk_var1rr;
+	packb_fp = bli_packm_blk_var1rr;
+#endif

 	const opid_t family = BLIS_TRSM;

@@ -78,7 +100,7 @@ cntl_t* bli_trsm_l_cntl_create
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  bli_trsm_packa,
-	  bli_packm_blk_var1,
+	  packa_fp,
 	  BLIS_MR,
 	  BLIS_MR,
 	  TRUE,    // do NOT invert diagonal
@@ -102,7 +124,7 @@ cntl_t* bli_trsm_l_cntl_create
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  bli_trsm_packb,
-	  bli_packm_blk_var1,
+	  packb_fp,
 	  BLIS_MR,
 	  BLIS_NR,
 	  FALSE,   // do NOT invert diagonal
@@ -140,7 +162,16 @@ cntl_t* bli_trsm_r_cntl_create
       pack_t schema_b
     )
 {
-	void* macro_kernel_p = bli_trsm_xx_ker_var2;
+	// trsm macrokernels are presently disabled for right-side execution,
+	// so it doesn't matter which function pointer we use here (sl or rr).
+	// To be safe, we'll insert an abort() guard to alert the developers
+	// of this should right-side macrokernels ever be re-enabled.
+	void* macro_kernel_p = bli_trsm_xx_ker_var2sl;
+
+	void* packa_fp = bli_packm_blk_var1sl;
+	void* packb_fp = bli_packm_blk_var1sl;
+
+	bli_abort();

 	const opid_t family = BLIS_TRSM;

@@ -165,7 +196,7 @@ cntl_t* bli_trsm_r_cntl_create
 	cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
 	(
 	  bli_trsm_packa,
-	  bli_packm_blk_var1,
+	  packa_fp,
 	  BLIS_NR,
 	  BLIS_MR,
 	  FALSE,   // do NOT invert diagonal
@@ -189,7 +220,7 @@ cntl_t* bli_trsm_r_cntl_create
 	cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
 	(
 	  bli_trsm_packb,
-	  bli_packm_blk_var1,
+	  packb_fp,
 	  BLIS_MR,
 	  BLIS_MR,
 	  TRUE,    // do NOT invert diagonal
--- a/frame/3/trsm/bli_trsm_ll_ker_var2rr.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2rr.c
@@ -0,0 +1,605 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffa,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha1,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   alpha2,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+
+void bli_trsm_ll_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	void*     buf_alpha1;
+	void*     buf_alpha2;
+
+	FUNCPTR_T f;
+
+	// Grab the address of the internal scalar buffer for the scalar
+	// attached to B (the non-triangular matrix). This will be the alpha
+	// scalar used in the gemmtrsm subproblems (ie: the scalar that would
+	// be applied to the packed copy of B prior to it being updated by
+	// the trsm subproblem). This scalar may be unit, if for example it
+	// was applied during packing.
+	buf_alpha1 = bli_obj_internal_scalar_buffer( b );
+
+	// Grab the address of the internal scalar buffer for the scalar
+	// attached to C. This will be the "beta" scalar used in the gemm-only
+	// subproblems that correspond to micro-panels that do not intersect
+	// the diagonal. We need this separate scalar because it's possible
+	// that the alpha attached to B was reset, if it was applied during
+	// packing.
+	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( diagoffa,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha1,
+	   buf_a, cs_a, pd_a, ps_a,
+	   buf_b, rs_b, pd_b, ps_b,
+	   buf_alpha2,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffa, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha1, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   alpha2, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt          = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR          = pd_a; \
+	const dim_t     NR          = pd_b; \
+	const dim_t     PACKMR      = cs_a; \
+	const dim_t     PACKNR      = rs_b; \
+\
+	/* Cast the micro-kernel address to its function pointer type. */ \
+	PASTECH(ch,gemmtrsm_ukr_ft) \
+	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
+	PASTECH(ch,gemm_ukr_ft) \
+	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero        = PASTEMAC(ch,0); \
+	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
+	ctype* restrict a_cast      = a; \
+	ctype* restrict b_cast      = b; \
+	ctype* restrict c_cast      = c; \
+	ctype* restrict alpha1_cast = alpha1; \
+	ctype* restrict alpha2_cast = alpha2; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffa_i; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_a1011; \
+	dim_t           k_a10; \
+	dim_t           off_a10; \
+	dim_t           off_a11; \
+	dim_t           i, j; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_a_num; \
+	inc_t           ss_a_den; \
+	inc_t           ps_a_cur; \
+	inc_t           is_a_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if both MR and NR are odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
+	   So we do nothing. */ \
+	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
+\
+	/* Compute k_full as k inflated up to a multiple of MR. This is
+	   needed because some parameter combinations of trsm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of B (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_a ) || \
+	     bli_is_3mi_packed( schema_a ) || \
+	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. Note that real-only, imag-only, and summed-only
+	   packing formats are not applicable here since trsm is a two-
+	   operand operation only (unlike trmm, which is capable of three-
+	   operand). */ \
+	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+	else                                 { ss_a_num = 1; ss_a_den = 1; } \
+\
+	/* If there is a zero region above where the diagonal of A intersects the
+	   left edge of the block, adjust the pointer to C and treat this case as
+	   if the diagonal offset were zero. This skips over the region that was
+	   not packed. (Note we assume the diagonal offset is a multiple of MR;
+	   this assumption will hold as long as the cache blocksizes are each a
+	   multiple of MR and NR.) */ \
+	if ( diagoffa < 0 ) \
+	{ \
+		i        = -diagoffa; \
+		m        = m - i; \
+		diagoffa = 0; \
+		c_cast   = c_cast + (i  )*rs_c; \
+	} \
+\
+	/* Check the k dimension, which needs to be a multiple of MR. If k
+	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
+	   kernel, which is expecting to perform an MR x MR triangular solve.
+	   This adjustment of k is consistent with what happened when A was
+	   packed: all of its bottom/right edges were zero-padded, and
+	   furthermore, the panel that stores the bottom-right corner of the
+	   matrix has its diagonal extended into the zero-padded region (as
+	   identity). This allows the trsm of that bottom-right panel to
+	   proceed without producing any infs or NaNs that would infect the
+	   "good" values of the corresponding block of B. */ \
+	if ( k % MR != 0 ) k += MR - ( k % MR ); \
+\
+	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
+	   know that the underlying buffer was already allocated to have an m
+	   dimension that is a multiple of PACKMR, with the region between the
+	   last row and the next multiple of MR zero-padded accordingly. */ \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+       dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k; \
+	istep_b = PACKNR * k_full; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* We don't bother querying the thrinfo_t node for the 1st loop because
+	   we can't parallelize that loop in trsm due to the inter-iteration
+	   dependencies that exist. */ \
+	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t jr_inc; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop.
+	   NOTE: Parallelism in the 1st loop is unattainable due to the
+	   inter-iteration dependencies present in trsm. */ \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		a1  = a_cast; \
+		c11 = c1 + (0  )*rstep_c; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = 0; i < m_iter; ++i ) \
+		{ \
+			diagoffa_i = diagoffa + ( doff_t )i*MR; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* If the current panel of A intersects the diagonal, use a
+			   special micro-kernel that performs a fused gemm and trsm.
+			   If the current panel of A resides below the diagonal, use a
+			   regular gemm micro-kernel. Otherwise, if it is above the
+			   diagonal, it was not packed (because it is implicitly zero)
+			   and so we do nothing. */ \
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict a10; \
+				ctype* restrict a11; \
+				ctype* restrict b01; \
+				ctype* restrict b11; \
+				ctype* restrict a2; \
+\
+				/* Compute various offsets into and lengths of parts of A. */ \
+				off_a10 = 0; \
+				k_a1011 = diagoffa_i + MR; \
+				k_a10   = k_a1011 - MR; \
+				off_a11 = k_a10; \
+\
+				/* Compute the panel stride for the current diagonal-
+				   intersecting micro-panel. */ \
+				is_a_cur  = k_a1011 * PACKMR; \
+				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
+				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
+\
+				/* Compute the addresses of the panel A10 and the triangular
+				   block A11. */ \
+				a10 = a1; \
+				/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
+				a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
+\
+				/* Compute the addresses of the panel B01 and the block
+				   B11. */ \
+				b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
+				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1 + ps_a_cur; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the fused gemm/trsm micro-kernel. */ \
+					gemmtrsm_ukr \
+					( \
+					  k_a10, \
+					  alpha1_cast, \
+					  a10, \
+					  a11, \
+					  b01, \
+					  b11, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the fused gemm/trsm micro-kernel. */ \
+					gemmtrsm_ukr \
+					( \
+					  k_a10, \
+					  alpha1_cast, \
+					  a10, \
+					  a11, \
+					  b01, \
+					  b11, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the bottom edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+\
+				a1 += ps_a_cur; \
+			} \
+			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict a2; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1 + rstep_a; \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  minus_one, \
+					  a1, \
+					  b1, \
+					  alpha2_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  minus_one, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        alpha2_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+\
+				a1 += rstep_a; \
+			} \
+\
+			c11 += rstep_c; \
+		} \
+	} \
+\
+/*
+if ( bli_is_4mi_packed( schema_a ) ){ \
+PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
+                     ( double* )b,    rs_b, 1, "%4.1f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
+                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
+}else{ \
+PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
+                     ( double* )b,   2*rs_b, 2, "%4.1f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
+                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
+} \
+*/ \
+\
+/*
+PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
+                     ( double* )a11, 1, PACKMR, "%4.1f", "" ); \
+*/ \
+\
+/*
+if ( bli_is_4mi_packed( schema_a ) ){ \
+PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
+                     ( double* )b,    rs_b, 1, "%4.1f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
+                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
+}else{ \
+PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
+                     ( double* )b,   2*rs_b, 2, "%4.1f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
+                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
+} \
+
+PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
+                     ( double* )c,    1, cs_c, "%4.1f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
+                     ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
+*/ \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" );  \
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" );  \
+*/ \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr )
+
--- a/frame/3/trsm/bli_trsm_ll_ker_var2sl.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trsm_ll_ker_var2
+void bli_trsm_ll_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -352,7 +355,9 @@ void PASTEMAC(ch,varname) \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd loop.
+	   NOTE: Parallelism in the 1st loop is unattainable due to the
+	   inter-iteration dependencies present in trsm. */ \
 	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
@@ -419,11 +424,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + ps_a_cur; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -483,11 +488,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + rstep_a; \
-				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -596,5 +601,5 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1,
 */ \
 }

-INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl )

--- a/frame/3/trsm/bli_trsm_lu_ker_var2rr.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2rr.c
@@ -0,0 +1,586 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       doff_t  diagoffa,
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha1,
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
+       void*   alpha2,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr);
+
+//
+// -- Macrokernel functions for round-robin partitioning -----------------------
+//
+
+void bli_trsm_lu_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	doff_t    diagoffa  = bli_obj_diag_offset( a );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	void*     buf_alpha1;
+	void*     buf_alpha2;
+
+	FUNCPTR_T f;
+
+	// Grab the address of the internal scalar buffer for the scalar
+	// attached to B (the non-triangular matrix). This will be the alpha
+	// scalar used in the gemmtrsm subproblems (ie: the scalar that would
+	// be applied to the packed copy of B prior to it being updated by
+	// the trsm subproblem). This scalar may be unit, if for example it
+	// was applied during packing.
+	buf_alpha1 = bli_obj_internal_scalar_buffer( b );
+
+	// Grab the address of the internal scalar buffer for the scalar
+	// attached to C. This will be the "beta" scalar used in the gemm-only
+	// subproblems that correspond to micro-panels that do not intersect
+	// the diagonal. We need this separate scalar because it's possible
+	// that the alpha attached to B was reset, if it was applied during
+	// packing.
+	buf_alpha2 = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( diagoffa,
+	   schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha1,
+	   buf_a, cs_a, pd_a, ps_a,
+	   buf_b, rs_b, pd_b, ps_b,
+	   buf_alpha2,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       doff_t  diagoffa, \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha1, \
+       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
+       void*   alpha2, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt          = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR          = pd_a; \
+	const dim_t     NR          = pd_b; \
+	const dim_t     PACKMR      = cs_a; \
+	const dim_t     PACKNR      = rs_b; \
+\
+	/* Cast the micro-kernel address to its function pointer type. */ \
+	PASTECH(ch,gemmtrsm_ukr_ft) \
+	               gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
+	PASTECH(ch,gemm_ukr_ft) \
+	                   gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero        = PASTEMAC(ch,0); \
+	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
+	ctype* restrict a_cast      = a; \
+	ctype* restrict b_cast      = b; \
+	ctype* restrict c_cast      = c; \
+	ctype* restrict alpha1_cast = alpha1; \
+	ctype* restrict alpha2_cast = alpha2; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	doff_t          diagoffa_i; \
+	dim_t           k_full; \
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	dim_t           k_a1112; \
+	dim_t           k_a11; \
+	dim_t           k_a12; \
+	dim_t           off_a11; \
+	dim_t           off_a12; \
+	dim_t           i, j, ib; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	inc_t           istep_a; \
+	inc_t           istep_b; \
+	inc_t           off_scl; \
+	inc_t           ss_a_num; \
+	inc_t           ss_a_den; \
+	inc_t           ps_a_cur; \
+	inc_t           is_a_cur; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* Safety trap: Certain indexing within this macro-kernel does not
+	   work as intended if both MR and NR are odd. */ \
+	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
+	   So we do nothing. */ \
+	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
+\
+	/* Compute k_full as k inflated up to a multiple of MR. This is
+	   needed because some parameter combinations of trsm reduce k
+	   to advance past zero regions in the triangular matrix, and
+	   when computing the imaginary stride of B (the non-triangular
+	   matrix), which is used by 4m1/3m1 implementations, we need
+	   this unreduced value of k. */ \
+	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
+\
+	/* Compute indexing scaling factor for 4m or 3m. This is
+	   needed because one of the packing register blocksizes (PACKMR
+	   or PACKNR) is used to index into the micro-panels of the non-
+	   triangular matrix when computing with a diagonal-intersecting
+	   micro-panel of the triangular matrix. In the case of 4m or 3m,
+	   real values are stored in both sub-panels, and so the indexing
+	   needs to occur in units of real values. The value computed
+	   here is divided into the complex pointer offset to cause the
+	   pointer to be advanced by the correct value. */ \
+	if ( bli_is_4mi_packed( schema_a ) || \
+	     bli_is_3mi_packed( schema_a ) || \
+	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+	else                                 off_scl = 1; \
+\
+	/* Compute the storage stride scaling. Usually this is just 1.
+	   However, in the case of interleaved 3m, we need to scale the
+	   offset by 3/2. Note that real-only, imag-only, and summed-only
+	   packing formats are not applicable here since trsm is a two-
+	   operand operation only (unlike trmm, which is capable of three-
+	   operand). */ \
+	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+	else                                 { ss_a_num = 1; ss_a_den = 1; } \
+\
+	/* If there is a zero region to the left of where the diagonal of A
+	   intersects the top edge of the block, adjust the pointer to B and
+	   treat this case as if the diagonal offset were zero. Note that we
+	   don't need to adjust the pointer to A since packm would have simply
+	   skipped over the region that was not stored. */ \
+	if ( diagoffa > 0 ) \
+	{ \
+		i        = diagoffa; \
+		k        = k - i; \
+		diagoffa = 0; \
+		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
+	} \
+\
+	/* If there is a zero region below where the diagonal of A intersects the
+	   right side of the block, shrink it to prevent "no-op" iterations from
+	   executing. */ \
+	if ( -diagoffa + k < m ) \
+	{ \
+		m = -diagoffa + k; \
+	} \
+\
+	/* Check the k dimension, which needs to be a multiple of MR. If k
+	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
+	   kernel, which is expecting to perform an MR x MR triangular solve.
+	   This adjustment of k is consistent with what happened when A was
+	   packed: all of its bottom/right edges were zero-padded, and
+	   furthermore, the panel that stores the bottom-right corner of the
+	   matrix has its diagonal extended into the zero-padded region (as
+	   identity). This allows the trsm of that bottom-right panel to
+	   proceed without producing any infs or NaNs that would infect the
+	   "good" values of the corresponding block of B. */ \
+	if ( k % MR != 0 ) k += MR - ( k % MR ); \
+\
+	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
+	   know that the underlying buffer was already allocated to have an m
+	   dimension that is a multiple of PACKMR, with the region between the
+	   last row and the next multiple of MR zero-padded accordingly. */ \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+       dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	istep_a = PACKMR * k; \
+	istep_b = PACKNR * k_full; \
+\
+	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
+	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* We don't bother querying the thrinfo_t node for the 1st loop because
+	   we can't parallelize that loop in trsm due to the inter-iteration
+	   dependencies that exist. */ \
+	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t jr_inc; \
+\
+	/* Use round-robin assignment of micropanels to threads in the 2nd loop.
+	   NOTE: Parallelism in the 1st loop is unattainable due to the
+	   inter-iteration dependencies present in trsm. */ \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		a1  = a_cast; \
+		c11 = c1 + (m_iter-1)*rstep_c; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( ib = 0; ib < m_iter; ++ib ) \
+		{ \
+			i          = m_iter - 1 - ib; \
+			diagoffa_i = diagoffa + ( doff_t )i*MR; \
+\
+			m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* If the current panel of A intersects the diagonal, use a
+			   special micro-kernel that performs a fused gemm and trsm.
+			   If the current panel of A resides above the diagonal, use a
+			   regular gemm micro-kernel. Otherwise, if it is below the
+			   diagonal, it was not packed (because it is implicitly zero)
+			   and so we do nothing. */ \
+			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict a11; \
+				ctype* restrict a12; \
+				ctype* restrict b11; \
+				ctype* restrict b21; \
+				ctype* restrict a2; \
+\
+				/* Compute various offsets into and lengths of parts of A. */ \
+				off_a11 = diagoffa_i; \
+				k_a1112 = k - off_a11;; \
+				k_a11   = MR; \
+				k_a12   = k_a1112 - MR; \
+				off_a12 = off_a11 + k_a11; \
+\
+				/* Compute the panel stride for the current diagonal-
+				   intersecting micro-panel. */ \
+				is_a_cur  = k_a1112 * PACKMR; \
+				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
+				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
+\
+				/* Compute the addresses of the triangular block A11 and the
+				   panel A12. */ \
+				a11 = a1; \
+				/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
+				a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
+\
+				/* Compute the addresses of the block B11 and the panel
+				   B21. */ \
+				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
+				b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1 + ps_a_cur; \
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the fused gemm/trsm micro-kernel. */ \
+					gemmtrsm_ukr \
+					( \
+					  k_a12, \
+					  alpha1_cast, \
+					  a12, \
+					  a11, \
+					  b21, \
+					  b11, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the fused gemm/trsm micro-kernel. */ \
+					gemmtrsm_ukr \
+					( \
+					  k_a12, \
+					  alpha1_cast, \
+					  a12, \
+					  a11, \
+					  b21, \
+					  b11, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Copy the result to the bottom edge of C. */ \
+					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+\
+				a1 += ps_a_cur; \
+			} \
+			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
+			{ \
+				ctype* restrict a2; \
+\
+				/* Compute the addresses of the next panels of A and B. */ \
+				a2 = a1 + rstep_a; \
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
+				{ \
+					a2 = a_cast; \
+					b2 = b1; \
+					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
+						b2 = b_cast; \
+				} \
+\
+				/* Save addresses of next panels of A and B to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_next_a( a2, &aux ); \
+				bli_auxinfo_set_next_b( b2, &aux ); \
+\
+				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
+				   object. */ \
+				bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+				/* Handle interior and edge cases separately. */ \
+				if ( m_cur == MR && n_cur == NR ) \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  minus_one, \
+					  a1, \
+					  b1, \
+					  alpha2_cast, \
+					  c11, rs_c, cs_c, \
+					  &aux, \
+					  cntx  \
+					); \
+				} \
+				else \
+				{ \
+					/* Invoke the gemm micro-kernel. */ \
+					gemm_ukr \
+					( \
+					  k, \
+					  minus_one, \
+					  a1, \
+					  b1, \
+					  zero, \
+					  ct, rs_ct, cs_ct, \
+					  &aux, \
+					  cntx  \
+					); \
+\
+					/* Add the result to the edge of C. */ \
+					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+					                        ct,  rs_ct, cs_ct, \
+					                        alpha2_cast, \
+					                        c11, rs_c,  cs_c ); \
+				} \
+\
+				a1 += rstep_a; \
+			} \
+\
+			c11 -= rstep_c; \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
+printf( "m_iter     = %lu\n", m_iter ); \
+printf( "m_cur      = %lu\n", m_cur ); \
+printf( "k          = %lu\n", k ); \
+printf( "diagoffa_i = %lu\n", diagoffa_i ); \
+printf( "off_a1112  = %lu\n", off_a1112 ); \
+printf( "k_a1112    = %lu\n", k_a1112 ); \
+printf( "k_a12      = %lu\n", k_a12 ); \
+printf( "k_a11      = %lu\n", k_a11 ); \
+printf( "rs_c,cs_c  = %lu %lu\n", rs_c, cs_c ); \
+printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
+*/ \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr )
+
--- a/frame/3/trsm/bli_trsm_lu_ker_var2sl.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2sl.c
@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
       thrinfo_t* thread
     );

-static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl);

+//
+// -- Macrokernel functions for slab partitioning ------------------------------
+//

-void bli_trsm_lu_ker_var2
+void bli_trsm_lu_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -360,7 +363,9 @@ void PASTEMAC(ch,varname) \
 	dim_t jr_start, jr_end; \
 	dim_t jr_inc; \
 \
-	/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
+	/* Use slab assignment of micropanels to threads in the 2nd loop.
+	   NOTE: Parallelism in the 1st loop is unattainable due to the
+	   inter-iteration dependencies present in trsm. */ \
 	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
@@ -429,11 +434,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + ps_a_cur; \
-				if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -493,11 +498,11 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + rstep_a; \
-				if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
+				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1; \
-					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
+					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -577,5 +582,5 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur,
 */ \
 }

-INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
+INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl )

--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -441,12 +442,12 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
 				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + ps_b_cur; \
-					if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
+					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -521,12 +522,12 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
 				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + cstep_b; \
-					if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
+					if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
 						b2 = b_cast; \
 				} \
 \
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -434,12 +435,12 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
 				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + ps_b_cur; \
-					if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
+					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
 						b2 = b_cast; \
 				} \
 \
@@ -514,12 +515,12 @@ void PASTEMAC(ch,varname) \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1; \
-				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
+				/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
 				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
 				{ \
 					a2 = a_cast; \
 					b2 = b1 + cstep_b; \
-					if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
+					if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
 						b2 = b_cast; \
 				} \
 \
--- a/frame/3/trsm/bli_trsm_var.h
+++ b/frame/3/trsm/bli_trsm_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -57,9 +58,14 @@ GENPROT( trsm_blk_var3 )
 GENPROT( trsm_packa )
 GENPROT( trsm_packb )

-GENPROT( trsm_xx_ker_var2 )
-GENPROT( trsm_ll_ker_var2 )
-GENPROT( trsm_lu_ker_var2 )
+GENPROT( trsm_xx_ker_var2sl )
+GENPROT( trsm_xx_ker_var2rr )
+
+GENPROT( trsm_ll_ker_var2sl )
+GENPROT( trsm_ll_ker_var2rr )
+GENPROT( trsm_lu_ker_var2sl )
+GENPROT( trsm_lu_ker_var2rr )
+
 GENPROT( trsm_rl_ker_var2 )
 GENPROT( trsm_ru_ker_var2 )

@@ -91,8 +97,11 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
-INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
+INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2rr )
+INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2sl )
+INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2rr )
+
 INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
 INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )

--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -34,13 +35,13 @@

 #include "blis.h"

-static trsm_var_oft vars[2][2] =
+static trsm_var_oft vars_sl[2][2] =
 {
-	{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
-	{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
+	{ bli_trsm_ll_ker_var2sl, bli_trsm_lu_ker_var2sl },
+	{ bli_trsm_rl_ker_var2  , bli_trsm_ru_ker_var2   }
 };

-void bli_trsm_xx_ker_var2
+void bli_trsm_xx_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
@@ -72,7 +73,62 @@ void bli_trsm_xx_ker_var2
 	}

 	// Index into the variant array to extract the correct function pointer.
-	f = vars[side][uplo];
+	f = vars_sl[side][uplo];
+
+	// Call the macrokernel.
+	f
+	(
+	  a,
+	  b,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl,
+	  thread
+	);
+}
+
+// -----------------------------------------------------------------------------
+
+static trsm_var_oft vars_rr[2][2] =
+{
+	{ bli_trsm_ll_ker_var2rr, bli_trsm_lu_ker_var2rr },
+	{ bli_trsm_rl_ker_var2  , bli_trsm_ru_ker_var2   }
+};
+
+void bli_trsm_xx_ker_var2rr
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	bool_t       side;
+	bool_t       uplo;
+	trsm_var_oft f;
+
+	// Set two bools: one based on the implied side parameter (the structure
+	// of the root object) and one based on the uplo field of the triangular
+	// matrix's root object (whether that is matrix A or matrix B).
+	if ( bli_obj_root_is_triangular( a ) )
+	{
+		side = 0;
+		if ( bli_obj_root_is_lower( a ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+	else // if ( bli_obj_root_is_triangular( b ) )
+	{
+		side = 1;
+		if ( bli_obj_root_is_lower( b ) ) uplo = 0;
+		else                              uplo = 1;
+	}
+
+	// Index into the variant array to extract the correct function pointer.
+	f = vars_rr[side][uplo];

 	// Call the macrokernel.
 	f
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -792,14 +792,16 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
 	       ( i != 0 || n_left == 0 );
 }

-static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
+static bool_t bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
 {
 	return ( bool_t )
-#ifdef BLIS_JRIR_INTERLEAVE
-	       ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
-#else
 	       ( i == end_iter - 1 );
-#endif
+}
+
+static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
+{
+	return ( bool_t )
+	       ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
 }


--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -232,8 +232,6 @@ void  bli_thread_init_rntm_from_env( rntm_t* rntm );

 // -----------------------------------------------------------------------------

-//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc );
-
 static void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
@@ -242,7 +240,7 @@ static void bli_thread_range_jrir_rr
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
-	   dim_t*     inc
+       dim_t*     inc
     )
 {
 	// Use interleaved partitioning of jr/ir loops.
@@ -259,7 +257,7 @@ static void bli_thread_range_jrir_sl
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
-	   dim_t*     inc
+       dim_t*     inc
     )
 {
 	// Use contiguous slab partitioning of jr/ir loops.
@@ -267,6 +265,7 @@ static void bli_thread_range_jrir_sl
 	*inc = 1;
 }

+#if 0
 static void bli_thread_range_jrir
     (
       thrinfo_t* thread,
@@ -275,14 +274,13 @@ static void bli_thread_range_jrir
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
-	   dim_t*     inc
+       dim_t*     inc
     )
 {
-//#ifdef BLIS_JRIR_INTERLEAVE
-#if 0
-	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
-#else
+#ifdef BLIS_ENABLE_JRIR_SLAB
 	bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
+#else
+	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
 #endif
 }

@@ -297,45 +295,30 @@ static void bli_thread_range_weighted_jrir
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
-	   dim_t*     inc
+       dim_t*     inc
     )
 {
-#ifdef BLIS_JRIR_INTERLEAVE
-	// Use interleaved partitioning of jr/ir loops.
-	*start = bli_thread_work_id( thread );
-	*inc   = bli_thread_n_way( thread );
-	*end   = n;
-#else
+#ifdef BLIS_ENABLE_JRIR_SLAB
+
 	// Use contiguous slab partitioning for jr/ir loops.
 	bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
-	                                   handle_edge_low, start, end );
+	                               handle_edge_low, start, end );

 	*start = *start / bf; *inc = 1;

 	if ( *end % bf ) *end = *end / bf + 1;
 	else             *end = *end / bf;

-#endif
+#else

-#if 0
-	const dim_t n_way = bli_thread_n_way( thread );
+	// Use interleaved partitioning of jr/ir loops.
+	*start = bli_thread_work_id( thread );
+	*inc   = bli_thread_n_way( thread );
+	*end   = n;

-	if ( m * n / n_way > 25000 )
-	{
-		// Use contiguous slab partitioning for jr/ir loops.
-		bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
-		                                   handle_edge_low, start, end );
-		*inc = 1;
-	}
-	else
-	{
-		// Use interleaved partitioning of jr/ir loops.
-		*start = bli_thread_work_id( thread );
-		*inc   = n_way; //bli_thread_n_way( thread );
-		*end   = n;
-	}
 #endif
 }
+#endif

 #endif

--- a/sandbox/ref99/cntl/blx_gemm_cntl.c
+++ b/sandbox/ref99/cntl/blx_gemm_cntl.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -54,7 +55,28 @@ cntl_t* blx_gemmbp_cntl_create
       pack_t schema_b
     )
 {
-	void* macro_kernel_p = blx_gemm_ker_var2;
+	void* macro_kernel_fp;
+	void* packa_fp;
+	void* packb_fp;
+
+#ifdef BLIS_ENABLE_JRIR_SLAB
+
+	// Use the sandbox's (blx_) macrokernel that uses slab assignment of
+	// micropanels to threads in the jr and ir loops, plus matching packm.
+	macro_kernel_fp = blx_gemm_ker_var2sl;
+
+	packa_fp = bli_packm_blk_var1sl;
+	packb_fp = bli_packm_blk_var1sl;
+
+#else // BLIS_ENABLE_JRIR_RR
+
+	// Use the sandbox's (blx_) macrokernel that uses round-robin assignment
+	// of micropanels to threads in the jr and ir loops, plus matching packm.
+	macro_kernel_fp = blx_gemm_ker_var2rr;
+
+	packa_fp = bli_packm_blk_var1rr;
+	packb_fp = bli_packm_blk_var1rr;
+#endif

 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
@@ -69,7 +91,7 @@ cntl_t* blx_gemmbp_cntl_create
 	(
 	  family,
 	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
-	  macro_kernel_p,
+	  macro_kernel_fp,
 	  gemm_cntl_bu_ke
 	);

@@ -77,7 +99,7 @@ cntl_t* blx_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
 	(
 	  blx_gemm_packa,  // pack the left-hand operand
-	  bli_packm_blk_var1,
+	  packa_fp,
 	  BLIS_MR,
 	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
@@ -101,7 +123,7 @@ cntl_t* blx_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
 	(
 	  blx_gemm_packb,  // pack the right-hand operand
-	  bli_packm_blk_var1,
+	  packb_fp,
 	  BLIS_KR,
 	  BLIS_NR,
 	  FALSE,   // do NOT invert diagonal
--- a/sandbox/ref99/vars/blx_gemm_ker_var2rr.c
+++ b/sandbox/ref99/vars/blx_gemm_ker_var2rr.c
@@ -59,14 +59,14 @@ typedef void (*gemm_fp)
 // Function pointer array for datatype-specific functions.
 static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
 {
-    PASTECH2(blx_,s,gemm_ker_var2),
-    PASTECH2(blx_,c,gemm_ker_var2),
-    PASTECH2(blx_,d,gemm_ker_var2),
-    PASTECH2(blx_,z,gemm_ker_var2)
+    PASTECH2(blx_,s,gemm_ker_var2rr),
+    PASTECH2(blx_,c,gemm_ker_var2rr),
+    PASTECH2(blx_,d,gemm_ker_var2rr),
+    PASTECH2(blx_,z,gemm_ker_var2rr)
 };


-void blx_gemm_ker_var2
+void blx_gemm_ker_var2rr
     (
       obj_t*  a,
       obj_t*  b,
@@ -272,8 +272,8 @@ void PASTECH2(blx_,ch,varname) \
 	dim_t jr_inc,   ir_inc; \
 \
 	/* Determine the thread range and increment for each thrinfo_t node. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_start; j < jr_end; j += jr_inc ) \
@@ -302,11 +302,11 @@ void PASTECH2(blx_,ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -363,11 +363,11 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c,
 }

 #if 0
-GENTFUNC( float,    s, gemm_ker_var2 )
-GENTFUNC( double,   d, gemm_ker_var2 )
-GENTFUNC( scomplex, c, gemm_ker_var2 )
-GENTFUNC( dcomplex, z, gemm_ker_var2 )
+GENTFUNC( float,    s, gemm_ker_var2rr )
+GENTFUNC( double,   d, gemm_ker_var2rr )
+GENTFUNC( scomplex, c, gemm_ker_var2rr )
+GENTFUNC( dcomplex, z, gemm_ker_var2rr )
 #else
-INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )
 #endif

--- a/sandbox/ref99/vars/blx_gemm_ker_var2sl.c
+++ b/sandbox/ref99/vars/blx_gemm_ker_var2sl.c
@@ -0,0 +1,373 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "blix.h"
+
+// Function pointer type for datatype-specific functions.
+typedef void (*gemm_fp)
+     (
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+// Function pointer array for datatype-specific functions.
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
+{
+    PASTECH2(blx_,s,gemm_ker_var2sl),
+    PASTECH2(blx_,c,gemm_ker_var2sl),
+    PASTECH2(blx_,d,gemm_ker_var2sl),
+    PASTECH2(blx_,z,gemm_ker_var2sl)
+};
+
+
+void blx_gemm_ker_var2sl
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	gemm_fp   f;
+
+	// Detach and multiply the scalars attached to A and B.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(blx_,ch,varname) \
+     ( \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           i, j; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Determine the thread range and increment for each thrinfo_t node. */ \
+	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* Handle interior and edge cases separately. */ \
+			if ( m_cur == MR && n_cur == NR ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+			else \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale the bottom edge of C and add the result from above. */ \
+				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+				                        ct,  rs_ct, cs_ct, \
+				                        beta_cast, \
+				                        c11, rs_c,  cs_c ); \
+			} \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+#if 0
+GENTFUNC( float,    s, gemm_ker_var2sl )
+GENTFUNC( double,   d, gemm_ker_var2sl )
+GENTFUNC( scomplex, c, gemm_ker_var2sl )
+GENTFUNC( dcomplex, z, gemm_ker_var2sl )
+#else
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
+#endif
+
--- a/sandbox/ref99/vars/blx_gemm_var.h
+++ b/sandbox/ref99/vars/blx_gemm_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -57,7 +58,8 @@ GENPROT( gemm_blk_var3 )
 GENPROT( gemm_packa )
 GENPROT( gemm_packb )

-GENPROT( gemm_ker_var2 )
+GENPROT( gemm_ker_var2sl )
+GENPROT( gemm_ker_var2rr )

 //
 // Prototype BLAS-like interfaces with void pointer operands.
@@ -85,5 +87,6 @@ void PASTECH2(blx_,ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )