diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c
index 44aba8ac7..b35d03f32 100644
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -43,7 +43,6 @@ err_t bli_gemmsup_int
        obj_t*  c,
        cntx_t* cntx,
        rntm_t* rntm,
-       cntl_t* cntl,
        thrinfo_t* thread
      )
 {
@@ -89,45 +88,86 @@ err_t bli_gemmsup_int
 	                                     stor_id == BLIS_CRR );
 	const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
 
-	const num_t   dt       = bli_obj_dt( c );
-	const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+	const num_t   dt         = bli_obj_dt( c );
+	const bool_t  row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
 
 	const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
 	                                      : is_rcc_crc_ccr_ccc );
 
+	const dim_t  m           = bli_obj_length( c );
+	const dim_t  n           = bli_obj_width( c );
+	const dim_t  MR          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t  NR          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+	const bool_t auto_factor = bli_rntm_auto_factor( rntm );
+	const dim_t  n_threads   = bli_rntm_num_threads( rntm );
+	bool_t       use_bp      = TRUE;
+	dim_t        jc_new;
+	dim_t        ic_new;
+
+
 	if ( is_primary )
 	{
 		// This branch handles:
 		//  - rrr rrc rcr crr for row-preferential kernels
 		//  - rcc crc ccr ccc for column-preferential kernels
 
-		const dim_t m  = bli_obj_length( c );
-		const dim_t n  = bli_obj_width( c );
-		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
 		const dim_t mu = m / MR;
 		const dim_t nu = n / NR;
 
-		if ( mu >= nu )
-		//if ( m % 2 == 1 && n % 2 == 1 )
+		// Decide which algorithm to use (block-panel var2m or panel-block
+		// var1n) based on the number of micropanels in the m and n dimensions.
+		// Also, recalculate the automatic thread factorization.
+		if         ( mu >= nu )    use_bp = TRUE;
+		else /* if ( mu <  nu ) */ use_bp = FALSE;
+
+		// If the parallel thread factorization was automatic, we update it
+		// with a new factorization based on the matrix dimensions in units
+		// of micropanels.
+		if ( auto_factor )
+		{
+			if ( use_bp )
+			{
+				// In the block-panel algorithm, the m dimension is parallelized
+				// with ic_nt and the n dimension is parallelized with jc_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+			}
+			else // if ( !use_bp )
+			{
+				// In the panel-block algorithm, the m dimension is parallelized
+				// with jc_nt and the n dimension is parallelized with ic_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
+			}
+
+			// Update the ways of parallelism for the jc and ic loops, and then
+			// update the current thread's root thrinfo_t node according to the
+			// new ways of parallelism value for the jc loop.
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+			bli_l3_sup_thrinfo_update_root( rntm, thread );
+		}
+
+
+		if ( use_bp )
 		{
 			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
 			printf( "bli_l3_sup_int(): var2m primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
 			bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
 			                       alpha, a, b, beta, c,
-			                       stor_id, cntx, rntm, cntl, thread );
+			                       stor_id, cntx, rntm, thread );
 		}
-		else // if ( mu < nu )
+		else // use_pb
 		{
 			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
 			printf( "bli_l3_sup_int(): var1n primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
 			bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
 			                       alpha, a, b, beta, c,
-			                       stor_id, cntx, rntm, cntl, thread );
+			                       stor_id, cntx, rntm, thread );
+			// *requires nudging of nc up to be a multiple of mr.
 		}
 	}
 	else
@@ -136,35 +176,64 @@ err_t bli_gemmsup_int
 		//  - rrr rrc rcr crr for column-preferential kernels
 		//  - rcc crc ccr ccc for row-preferential kernels
 
-		const dim_t mt = bli_obj_width( c );
-		const dim_t nt = bli_obj_length( c );
-		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
-		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
-		const dim_t mu = mt / MR;
-		const dim_t nu = nt / NR;
+		const dim_t mu = n / MR; // the n becomes m after a transposition
+		const dim_t nu = m / NR; // the m becomes n after a transposition
 
-		if ( mu >= nu )
-		//if ( mt % 2 == 1 && nt % 2 == 1 )
+		// Decide which algorithm to use (block-panel var2m or panel-block
+		// var1n) based on the number of micropanels in the m and n dimensions.
+		// Also, recalculate the automatic thread factorization.
+		if         ( mu >= nu )    use_bp = TRUE;
+		else /* if ( mu <  nu ) */ use_bp = FALSE;
+
+		// If the parallel thread factorization was automatic, we update it
+		// with a new factorization based on the matrix dimensions in units
+		// of micropanels.
+		if ( auto_factor )
+		{
+			if ( use_bp )
+			{
+				// In the block-panel algorithm, the m dimension is parallelized
+				// with ic_nt and the n dimension is parallelized with jc_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+			}
+			else // if ( !use_bp )
+			{
+				// In the panel-block algorithm, the m dimension is parallelized
+				// with jc_nt and the n dimension is parallelized with ic_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
+			}
+
+			// Update the ways of parallelism for the jc and ic loops, and then
+			// update the current thread's root thrinfo_t node according to the
+			// new ways of parallelism value for the jc loop.
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+			bli_l3_sup_thrinfo_update_root( rntm, thread );
+		}
+
+
+		if ( use_bp )
 		{
 			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
 			printf( "bli_l3_sup_int(): var2m non-primary\n" );
 			#endif
 			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
 			bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
 			                       alpha, a, b, beta, c,
-			                       stor_id, cntx, rntm, cntl, thread );
+			                       stor_id, cntx, rntm, thread );
 		}
-		else // if ( mu < nu )
+		else // use_pb
 		{
 			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
 			printf( "bli_l3_sup_int(): var1n non-primary\n" );
 			#endif
 			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
 			bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
 			                       alpha, a, b, beta, c,
-			                       stor_id, cntx, rntm, cntl, thread );
+			                       stor_id, cntx, rntm, thread );
+			// *requires nudging of mc up to be a multiple of nr.
 		}
-		// *requires nudging of mc,nc up to be a multiple of nr,mr.
 	}
 
 	// Return success so that the caller knows that we computed the solution.
diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h
index 418e905da..bdea544b2 100644
--- a/frame/3/bli_l3_sup_int.h
+++ b/frame/3/bli_l3_sup_int.h
@@ -41,6 +41,5 @@ err_t bli_gemmsup_int
        obj_t*  c,
        cntx_t* cntx,
        rntm_t* rntm,
-       cntl_t* cntl,
        thrinfo_t* thread
      );
diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h
index 5d31c0aa7..c875ab5a8 100644
--- a/frame/3/bli_l3_sup_packm_a.h
+++ b/frame/3/bli_l3_sup_packm_a.h
@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
      ( \
        bool_t           will_pack, \
        packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
        dim_t            m, \
        dim_t            k, \
        dim_t            mr, \
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
        dim_t            mr, \
        dim_t*  restrict m_max, \
        dim_t*  restrict k_max, \
-       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
+       ctype*           a, inc_t           rs_a, inc_t           cs_a, \
        ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                            dim_t* restrict pd_p, inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
@@ -97,8 +96,11 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
 void PASTEMAC(ch,opname) \
      ( \
        bool_t           will_pack, \
+       packbuf_t        pack_buf_type, \
        stor3_t          stor_id, \
        trans_t          transc, \
+       dim_t            m_alloc, \
+       dim_t            k_alloc, \
        dim_t            m, \
        dim_t            k, \
        dim_t            mr, \
@@ -107,6 +109,7 @@ void PASTEMAC(ch,opname) \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
        mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h
index a934718e4..8ee2cc92c 100644
--- a/frame/3/bli_l3_sup_packm_b.h
+++ b/frame/3/bli_l3_sup_packm_b.h
@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
      ( \
        bool_t           will_pack, \
        packbuf_t        pack_buf_type, \
-       stor3_t          stor_id, \
        dim_t            k, \
        dim_t            n, \
        dim_t            nr, \
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
        dim_t            nr, \
        dim_t*  restrict k_max, \
        dim_t*  restrict n_max, \
-       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
+       ctype*           b, inc_t           rs_b, inc_t           cs_b, \
        ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                            dim_t* restrict pd_p, inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
@@ -97,16 +96,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
 void PASTEMAC(ch,opname) \
      ( \
        bool_t           will_pack, \
+       packbuf_t        pack_buf_type, \
        stor3_t          stor_id, \
        trans_t          transc, \
+       dim_t            k_alloc, \
+       dim_t            n_alloc, \
        dim_t            k, \
        dim_t            n, \
        dim_t            nr, \
        ctype*  restrict kappa, \
-       ctype*  restrict x, inc_t           rs_x, inc_t           cs_x, \
+       ctype*  restrict b, inc_t           rs_b, inc_t           cs_b, \
        ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                  inc_t* restrict ps_p, \
        cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
        mem_t*  restrict mem, \
        thrinfo_t* restrict thread  \
      ); \
diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c
index f4d260fd8..b3593b839 100644
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -327,3 +327,137 @@ bli_thread_obarrier( thread ); \
 		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
 		} \
 */
+
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       ctype*  restrict kappa, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it; \
+	dim_t           vector_len; \
+	inc_t           incc, ldc; \
+	inc_t           incp, ldp; \
+	conj_t          conjc; \
+\
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* If c needs a transposition, induce it so that we can more simply
+	   express the remaining parameters and code. */ \
+	if ( bli_does_trans( transc ) ) \
+	{ \
+		bli_swap_incs( &rs_c, &cs_c ); \
+		bli_toggle_trans( &transc ); \
+	} \
+\
+	/* Create flags to incidate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	bool_t col_stored = bli_is_col_packed( schema ); \
+	/*bool_t row_stored = bli_is_row_packed( schema );*/ \
+\
+	if ( col_stored ) \
+	{ \
+		/* Prepare to pack to a column-stored matrix. */ \
+		iter_dim       = n; \
+		vector_len     = m; \
+		incc           = rs_c; \
+		ldc            = cs_c; \
+		incp           = 1; \
+		ldp            = cs_p; \
+	} \
+	else /* if ( row_stored ) */ \
+	{ \
+		/* Prepare to pack to a row-stored matrix. */ \
+		iter_dim       = m; \
+		vector_len     = n; \
+		incc           = cs_c; \
+		ldc            = rs_c; \
+		incp           = 1; \
+		ldp            = rs_p; \
+	} \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	n_iter = iter_dim; \
+\
+\
+	ctype* restrict p_begin = p_cast; \
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
+	( void )nt; \
+	( void )tid; \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( it = 0; it < n_iter; it += 1 ) \
+	{ \
+		ctype* restrict c_begin = c_cast + (it  )*ldc; \
+\
+		ctype* restrict c_use = c_begin; \
+		ctype* restrict p_use = p_begin; \
+\
+		{ \
+			/* The definition of bli_packm_my_iter() will depend on whether slab
+			   or round-robin partitioning was requested at configure-time. */ \
+			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
+				( \
+				  conjc, \
+				  vector_len, \
+				  kappa_cast, \
+				  c_use, incc, \
+				  p_use, incp, \
+				  cntx, \
+				  NULL  \
+				); \
+			} \
+\
+		} \
+\
+		p_begin += ldp; \
+\
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
+                      p_use,         rs_p, cs_p, "%5.2f", "" ); \
+if ( !row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
+                      p_use,         rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+	} \
+}
+
+INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 )
+
diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h
index 877601ec4..5ccdd3b76 100644
--- a/frame/3/bli_l3_sup_packm_var.h
+++ b/frame/3/bli_l3_sup_packm_var.h
@@ -58,3 +58,21 @@ void PASTEMAC(ch,varname) \
 
 INSERT_GENTPROT_BASIC0( packm_sup_var1 )
 
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       ctype*  restrict kappa, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     );
+
+INSERT_GENTPROT_BASIC0( packm_sup_var2 )
+
diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c
index 657220437..2580feb77 100644
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -360,8 +360,6 @@ void PASTEMAC(ch,varname) \
 \
 	/*
 	const inc_t jrstep_a = rs_a * MR; \
-	( void )jrstep_a; \
-	*/ \
 \
 	const inc_t irstep_c = cs_c * NR; \
 	const inc_t irstep_b = cs_b * NR; \
@@ -462,33 +460,56 @@ void PASTEMAC(ch,varname) \
 	mem_t mem_a = BLIS_MEM_INITIALIZER; \
 	mem_t mem_b = BLIS_MEM_INITIALIZER; \
 \
-	/* Prepare the packing destination buffer. If packing is not requested for
-	   matrix B, this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_a) \
-	( \
-	  packa, \
-	  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
-	  stor_id, \
-	  NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
-	  cntx, \
-	  rntm, \
-	  &mem_a, \
-	  thread  \
-	); \
+	/* Define an array of bszid_t ids, which will act as our substitute for
+	   the cntl_t tree.
+	   NOTE: These bszid_t values, and their order, match that of the bp
+	   algorithm (variant 2) because they are not used to query actual
+	   blocksizes but rather query the ways of parallelism for the various
+	   loops. For example, the 2nd loop in variant 1 partitions in the m
+	   dimension (in increments of MR), but parallelizes that m dimension
+	   with BLIS_JR_NT. The only difference is that the _packa and _packb
+	   arrays have been adjusted for the semantic difference in order in
+	   which packa and packb nodes are encountered in the thrinfo tree.
+	   That is, this panel-block algorithm partitions an NC x KC submatrix
+	   of A to be packed in the 4th loop, and a KC x MC submatrix of B
+	   to be packed in the 3rd loop. */ \
+	/*                           5thloop  4thloop         packa  3rdloop         packb  2ndloop  1stloop  ukrloop */ \
+	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t* restrict bszids; \
 \
-	/* Prepare the packing destination buffer. If packing is not requested for
-	   matrix B, this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_b) \
-	( \
-	  packb, \
-	  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
-	  stor_id, \
-	  KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
-	  cntx, \
-	  rntm, \
-	  &mem_b, \
-	  thread  \
-	); \
+	/* Set the bszids pointer to the correct bszids array above based on which
+	   matrices (if any) are being packed. */ \
+	if ( packa ) { if ( packb ) bszids = bszids_packab; \
+	               else         bszids = bszids_packa; } \
+	else         { if ( packb ) bszids = bszids_packb; \
+	               else         bszids = bszids_nopack; } \
+\
+	/* Determine whether we are using more than one thread. */ \
+	const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
+\
+	thrinfo_t* restrict thread_jc = NULL; \
+	thrinfo_t* restrict thread_pc = NULL; \
+	thrinfo_t* restrict thread_pa = NULL; \
+	thrinfo_t* restrict thread_ic = NULL; \
+	thrinfo_t* restrict thread_pb = NULL; \
+	thrinfo_t* restrict thread_jr = NULL; \
+\
+	/* Grow the thrinfo_t tree. */ \
+	bszid_t*   restrict bszids_jc = bszids; \
+	                    thread_jc = thread; \
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+	/* Compute the JC loop thread range for the current thread. */ \
+	dim_t jc_start, jc_end; \
+	bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
+	const dim_t m_local = jc_end - jc_start; \
+\
+	/* Compute number of primary and leftover components of the JC loop. */ \
+	/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
+	const dim_t jc_left =   m_local % NC; \
 \
 	/* Loop over the m dimension (NC rows/columns at a time). */ \
 	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -594,29 +615,48 @@ void PASTEMAC(ch,varname) \
 \
 			ctype* a_use; \
 			inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+			/* Set the bszid_t array and thrinfo_t pointer based on whether
+			   we will be packing A. If we won't be packing A, we alias to
+			   the _pc variables so that code further down can unconditionally
+			   reference the _pa variables. Note that *if* we will be packing
+			   A, the thrinfo_t node will have already been created by a
+			   previous call to bli_thrinfo_grow(), since bszid values of
+			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+			   bszid that is a normal bszid_t value). */ \
+			bszid_t*   restrict bszids_pa; \
+			if ( packa ) { bszids_pa = &bszids_pc[1]; \
+			               thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
+			else         { bszids_pa = &bszids_pc[0]; \
+			               thread_pa = thread_pc; } \
 \
 			/* Determine the packing buffer and related parameters for matrix
 			   A. (If A will not be packed, then a_use will be set to point to
 			   a and the _a_use strides will be set accordingly.) Then call
 			   the packm sup variant chooser, which will call the appropriate
-			   implementation based on the schema deduced from the stor_id. */ \
+			   implementation based on the schema deduced from the stor_id.
+			   NOTE: packing matrix A in this panel-block algorithm corresponds
+			   to packing matrix B in the block-panel algorithm. */ \
 			PASTEMAC(ch,packm_sup_a) \
 			( \
 			  packa, \
-			  stor_id, \
+			  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
+			  stor_id,                 /* a "panel of B".                  */ \
 			  BLIS_NO_TRANSPOSE, \
+			  NC,     KC,       /* This "panel of B" is (at most) NC x KC. */ \
 			  nc_cur, kc_cur, MR, \
-			  one, \
+			  &one_local, \
 			  a_pc,   rs_a,      cs_a, \
 			  &a_use, &rs_a_use, &cs_a_use, \
 			                     &ps_a_use, \
 			  cntx, \
+			  rntm, \
 			  &mem_a, \
-			  thread  \
+			  thread_pa  \
 			); \
 \
 			/* Alias a_use so that it's clear this is our current block of
-			   matrix B. */ \
+			   matrix A. */ \
 			ctype* restrict a_pc_use = a_use; \
 \
 			/* We don't need to embed the panel stride of A within the auxinfo_t
@@ -624,6 +664,20 @@ void PASTEMAC(ch,varname) \
 			   which occurs here, within the macrokernel, not within the
 			   millikernel. */ \
 			/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
+\
+			/* Grow the thrinfo_t tree. */ \
+			bszid_t*   restrict bszids_ic = &bszids_pa[1]; \
+			                    thread_ic = bli_thrinfo_sub_node( thread_pa ); \
+			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+			/* Compute the IC loop thread range for the current thread. */ \
+			dim_t ic_start, ic_end; \
+			bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
+			const dim_t n_local = ic_end - ic_start; \
+\
+			/* Compute number of primary and leftover components of the IC loop. */ \
+			/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
+			const dim_t ic_left =   n_local % MC; \
 \
 			/* Loop over the n dimension (MC rows at a time). */ \
 			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -712,6 +766,20 @@ void PASTEMAC(ch,varname) \
 \
 				ctype* b_use; \
 				inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+				/* Set the bszid_t array and thrinfo_t pointer based on whether
+				   we will be packing A. If we won't be packing A, we alias to
+				   the _pc variables so that code further down can unconditionally
+				   reference the _pa variables. Note that *if* we will be packing
+				   A, the thrinfo_t node will have already been created by a
+				   previous call to bli_thrinfo_grow(), since bszid values of
+				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+				   bszid that is a normal bszid_t value). */ \
+				bszid_t*   restrict bszids_pb; \
+				if ( packb ) { bszids_pb = &bszids_ic[1]; \
+							   thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
+				else         { bszids_pb = &bszids_ic[0]; \
+							   thread_pb = thread_ic; } \
 \
 				/* Determine the packing buffer and related parameters for matrix
 				   B. (If B will not be packed, then b_use will be set to point to
@@ -723,16 +791,19 @@ void PASTEMAC(ch,varname) \
 				PASTEMAC(ch,packm_sup_b) \
 				( \
 				  packb, \
-				  stor_id, \
+				  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
+				  stor_id,                 /* a "block of A".                  */ \
 				  BLIS_NO_TRANSPOSE, \
+				  KC,     MC,       /* This "block of A" is (at most) KC x MC. */ \
 				  kc_cur, mc_cur, NR, \
-				  one, \
+				  &one_local, \
 				  b_ic,   rs_b,      cs_b, \
 				  &b_use, &rs_b_use, &cs_b_use, \
 				                     &ps_b_use, \
 				  cntx, \
+				  rntm, \
 				  &mem_b, \
-				  thread  \
+				  thread_pb  \
 				); \
 \
 				/* Alias b_use so that it's clear this is our current block of
@@ -744,6 +815,29 @@ void PASTEMAC(ch,varname) \
 				   micropanels of B. */ \
 				bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
 \
+				/* Grow the thrinfo_t tree. */ \
+				bszid_t*   restrict bszids_jr = &bszids_pb[1]; \
+				                    thread_jr = bli_thrinfo_sub_node( thread_pb ); \
+				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+				/* Compute number of primary and leftover components of the JR loop. */ \
+				dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
+				dim_t jr_left =   nc_cur % MR; \
+\
+				/* Compute the JR loop thread range for the current thread. */ \
+				dim_t jr_start, jr_end; \
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				/* An optimization: allow the last jr iteration to contain up to MRE
+				   rows of C and A. (If MRE > MR, the mkernel has agreed to handle
+				   these cases.) Note that this prevents us from declaring jr_iter and
+				   jr_left as const. NOTE: We forgo this optimization when packing A
+				   since packing an extended edge case is not yet supported. */ \
+				if ( !packa && !is_mt ) \
+				if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
+				{ \
+					jr_iter--; jr_left += MR; \
+				} \
 \
 				/* Loop over the m dimension (NR columns at a time). */ \
 				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -1194,33 +1288,45 @@ void PASTEMAC(ch,varname) \
 	mem_t mem_a = BLIS_MEM_INITIALIZER; \
 	mem_t mem_b = BLIS_MEM_INITIALIZER; \
 \
-	/* Prepare the packing destination buffer. If packing is not requested for
-	   matrix A, this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_a) \
-	( \
-	  packa, \
-	  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
-	  stor_id, \
-	  MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
-	  cntx, \
-	  rntm, \
-	  &mem_a, \
-	  thread  \
-	); \
+	/* Define an array of bszid_t ids, which will act as our substitute for
+	   the cntl_t tree. */ \
+	/*                           5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop */ \
+	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC,               BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC,               BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC,               BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
+	bszid_t* restrict bszids; \
 \
-	/* Prepare the packing destination buffer. If packing is not requested for
-	   matrix B, this function will reduce to a no-op. */ \
-	PASTEMAC(ch,packm_sup_init_mem_b) \
-	( \
-	  packb, \
-	  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
-	  stor_id, \
-	  KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
-	  cntx, \
-	  rntm, \
-	  &mem_b, \
-	  thread  \
-	); \
+	/* Set the bszids pointer to the correct bszids array above based on which
+	   matrices (if any) are being packed. */ \
+	if ( packa ) { if ( packb ) bszids = bszids_packab; \
+	               else         bszids = bszids_packa; } \
+	else         { if ( packb ) bszids = bszids_packb; \
+	               else         bszids = bszids_nopack; } \
+\
+	/* Determine whether we are using more than one thread. */ \
+	const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
+\
+	thrinfo_t* restrict thread_jc = NULL; \
+	thrinfo_t* restrict thread_pc = NULL; \
+	thrinfo_t* restrict thread_pb = NULL; \
+	thrinfo_t* restrict thread_ic = NULL; \
+	thrinfo_t* restrict thread_pa = NULL; \
+	thrinfo_t* restrict thread_jr = NULL; \
+\
+	/* Grow the thrinfo_t tree. */ \
+	bszid_t*   restrict bszids_jc = bszids; \
+	                    thread_jc = thread; \
+	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+	/* Compute the JC loop thread range for the current thread. */ \
+	dim_t jc_start, jc_end; \
+	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+	const dim_t n_local = jc_end - jc_start; \
+\
+	/* Compute number of primary and leftover components of the JC loop. */ \
+	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+	const dim_t jc_left =   n_local % NC; \
 \
 	/* Loop over the n dimension (NC rows/columns at a time). */ \
 	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -1324,6 +1430,20 @@ void PASTEMAC(ch,varname) \
 \
 			ctype* b_use; \
 			inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+			/* Set the bszid_t array and thrinfo_t pointer based on whether
+			   we will be packing B. If we won't be packing B, we alias to
+			   the _pc variables so that code further down can unconditionally
+			   reference the _pb variables. Note that *if* we will be packing
+			   B, the thrinfo_t node will have already been created by a
+			   previous call to bli_thrinfo_grow(), since bszid values of
+			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+			   bszid that is a normal bszid_t value). */ \
+			bszid_t*   restrict bszids_pb; \
+			if ( packb ) { bszids_pb = &bszids_pc[1]; \
+			               thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
+			else         { bszids_pb = &bszids_pc[0]; \
+			               thread_pb = thread_pc; } \
 \
 			/* Determine the packing buffer and related parameters for matrix
 			   B. (If B will not be packed, then a_use will be set to point to
@@ -1333,16 +1453,19 @@ void PASTEMAC(ch,varname) \
 			PASTEMAC(ch,packm_sup_b) \
 			( \
 			  packb, \
-			  stor_id, \
+			  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
+			  stor_id,                 /* a "panel of B."                  */ \
 			  BLIS_NO_TRANSPOSE, \
+			  KC,     NC,       /* This "panel of B" is (at most) KC x NC. */ \
 			  kc_cur, nc_cur, NR, \
-			  one, \
+			  &one_local, \
 			  b_pc,   rs_b,      cs_b, \
 			  &b_use, &rs_b_use, &cs_b_use, \
 			                     &ps_b_use, \
 			  cntx, \
+			  rntm, \
 			  &mem_b, \
-			  thread  \
+			  thread_pb  \
 			); \
 \
 			/* Alias a_use so that it's clear this is our current block of
@@ -1354,6 +1477,20 @@ void PASTEMAC(ch,varname) \
 			   which occurs here, within the macrokernel, not within the
 			   millikernel. */ \
 			/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
+\
+			/* Grow the thrinfo_t tree. */ \
+			bszid_t*   restrict bszids_ic = &bszids_pb[1]; \
+			                    thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+			/* Compute the IC loop thread range for the current thread. */ \
+			dim_t ic_start, ic_end; \
+			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+			const dim_t m_local = ic_end - ic_start; \
+\
+			/* Compute number of primary and leftover components of the IC loop. */ \
+			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+			const dim_t ic_left =   m_local % MC; \
 \
 			/* Loop over the m dimension (MC rows at a time). */ \
 			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -1440,6 +1577,20 @@ void PASTEMAC(ch,varname) \
 \
 				ctype* a_use; \
 				inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+				/* Set the bszid_t array and thrinfo_t pointer based on whether
+				   we will be packing B. If we won't be packing A, we alias to
+				   the _ic variables so that code further down can unconditionally
+				   reference the _pa variables. Note that *if* we will be packing
+				   A, the thrinfo_t node will have already been created by a
+				   previous call to bli_thrinfo_grow(), since bszid values of
+				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
+				   bszid that is a normal bszid_t value). */ \
+				bszid_t*   restrict bszids_pa; \
+				if ( packa ) { bszids_pa = &bszids_ic[1]; \
+							   thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
+				else         { bszids_pa = &bszids_ic[0]; \
+							   thread_pa = thread_ic; } \
 \
 				/* Determine the packing buffer and related parameters for matrix
 				   A. (If A will not be packed, then a_use will be set to point to
@@ -1449,16 +1600,19 @@ void PASTEMAC(ch,varname) \
 				PASTEMAC(ch,packm_sup_a) \
 				( \
 				  packa, \
-				  stor_id, \
+				  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
+				  stor_id,                 /* a "block of A."                  */ \
 				  BLIS_NO_TRANSPOSE, \
+				  MC,     KC,       /* This "block of A" is (at most) MC x KC. */ \
 				  mc_cur, kc_cur, MR, \
-				  one, \
+				  &one_local, \
 				  a_ic,   rs_a,      cs_a, \
 				  &a_use, &rs_a_use, &cs_a_use, \
 				                     &ps_a_use, \
 				  cntx, \
+				  rntm, \
 				  &mem_a, \
-				  thread  \
+				  thread_pa  \
 				); \
 \
 				/* Alias a_use so that it's clear this is our current block of
@@ -1469,6 +1623,30 @@ void PASTEMAC(ch,varname) \
 				   millikernel will query and use this to iterate through
 				   micropanels of A (if needed). */ \
 				bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
+\
+				/* Grow the thrinfo_t tree. */ \
+				bszid_t*   restrict bszids_jr = &bszids_pa[1]; \
+				                    thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+				/* Compute number of primary and leftover components of the JR loop. */ \
+				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+				dim_t jr_left =   nc_cur % NR; \
+\
+				/* Compute the JR loop thread range for the current thread. */ \
+				dim_t jr_start, jr_end; \
+				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				/* An optimization: allow the last jr iteration to contain up to NRE
+				   columns of C and B. (If NRE > NR, the mkernel has agreed to handle
+				   these cases.) Note that this prevents us from declaring jr_iter and
+				   jr_left as const. NOTE: We forgo this optimization when packing B
+				   since packing an extended edge case is not yet supported. */ \
+				if ( !packb && !is_mt ) \
+				if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
+				{ \
+					jr_iter--; jr_left += NR; \
+				} \
 \
 				/* Loop over the n dimension (NR columns at a time). */ \
 				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
index 3ff4eb3a0..0bf3ad854 100644
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -125,7 +125,7 @@ void bli_l3_thread_decorator
 
 		// Alias thread-local copies of A, B, and C. These will be the objects
 		// we pass down the algorithmic function stack. Making thread-local
-		// alaises is highly recommended in case a thread needs to change any
+		// aliases is highly recommended in case a thread needs to change any
 		// of the properties of an object without affecting other threads'
 		// objects.
 		bli_obj_alias_to( a, &a_t );
diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c
index d0b48d7e2..a2c453062 100644
--- a/frame/thread/bli_l3_decor_pthreads.c
+++ b/frame/thread/bli_l3_decor_pthreads.c
@@ -96,7 +96,7 @@ void* bli_l3_thread_entry( void* data_void )
 
 	// Alias thread-local copies of A, B, and C. These will be the objects
 	// we pass down the algorithmic function stack. Making thread-local
-	// alaises is highly recommended in case a thread needs to change any
+	// aliases is highly recommended in case a thread needs to change any
 	// of the properties of an object without affecting other threads'
 	// objects.
 	bli_obj_alias_to( a, &a_t );
diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h
index 01dfb8f6e..a001e5b74 100644
--- a/frame/thread/bli_l3_sup_decor.h
+++ b/frame/thread/bli_l3_sup_decor.h
@@ -48,7 +48,6 @@ typedef err_t (*l3supint_t)
        obj_t*     c,
        cntx_t*    cntx,
        rntm_t*    rntm,
-       cntl_t*    cntl,
        thrinfo_t* thread
      );
 
@@ -57,8 +56,6 @@ err_t bli_l3_sup_thread_decorator
      (
        l3supint_t func,
        opid_t     family,
-       //pack_t     schema_a,
-       //pack_t     schema_b,
        obj_t*     alpha,
        obj_t*     a,
        obj_t*     b,
diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c
index 7808c1333..4baacd1ea 100644
--- a/frame/thread/bli_l3_sup_decor_openmp.c
+++ b/frame/thread/bli_l3_sup_decor_openmp.c
@@ -40,16 +40,14 @@
 // Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
 // pthreads version, so that when building Windows DLLs (with OpenMP enabled
 // or no multithreading) we don't risk having an unresolved symbol.
-//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
-
+void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
 
+//#define PRINT_THRINFO
 
 err_t bli_l3_sup_thread_decorator
      (
        l3supint_t func,
        opid_t     family,
-       //pack_t     schema_a,
-       //pack_t     schema_b,
        obj_t*     alpha,
        obj_t*     a,
        obj_t*     b,
@@ -59,36 +57,8 @@ err_t bli_l3_sup_thread_decorator
        rntm_t*    rntm
      )
 {
-#if 0
-
-	return
-	bli_gemmsup_int
-	(
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm,
-	  0
-	);
-
-#else
-
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	//pack_t schema_a = bli_obj_pack_schema( a );
-	//pack_t schema_b = bli_obj_pack_schema( b );
-	//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
+	// Query the total number of threads from the rntm_t object.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
 	// NOTE: The sba was initialized in bli_init().
 
@@ -99,55 +69,45 @@ err_t bli_l3_sup_thread_decorator
 	array_t* restrict array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we can create the global comm below.
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
 	bli_sba_rntm_set_pool( 0, array, rntm );
 
-	// Set the packing block allocator field of the rntm.
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
 	bli_membrk_rntm_set_membrk( rntm );
 
-#if 0
 	// Allcoate a global communicator for the root thrinfo_t structures.
 	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
-#endif
 
 
+	_Pragma( "omp parallel num_threads(n_threads)" )
 	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
+		// Create a thread-local copy of the master thread's rntm_t. This is
+		// necessary since we want each thread to be able to track its own
+		// small block pool_t as it executes down the function stack.
+		rntm_t           rntm_l = *rntm;
+		rntm_t* restrict rntm_p = &rntm_l;
 
-		cntl_t*    cntl_use = NULL;
-		//thrinfo_t* thread   = NULL;
-		thrinfo_t* thread   = &BLIS_PACKM_SINGLE_THREADED;
+		// Query the thread's id from OpenMP.
+		const dim_t tid = omp_get_thread_num();
 
-		const dim_t tid = 0;
+		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
+		// NOTE: This calls the same function used for the conventional/large
+		// code path.
+		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
 
 		// Use the thread id to access the appropriate pool_t* within the
 		// array_t, and use it to set the sba_pool field within the rntm_t.
 		// If the pool_t* element within the array_t is NULL, it will first
 		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
+		bli_sba_rntm_set_pool( tid, array, rntm_p );
 
-		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
-		// need to alias objects for A, B, and C since they were already aliased
-		// in bli_*_front(). However, we may add aliasing here in the future so
-		// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
-		// consistently providing local aliases, we can then eliminate aliasing
-		// elsewhere.
-
-		// Create a default control tree for the operation, if needed.
-		//bli_l3_cntl_create_if( family, schema_a, schema_b,
-		//                       a, b, c, rntm_p, cntl, &cntl_use );
-#if 0
-		cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
+		thrinfo_t* thread = NULL;
 
 		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-#endif
-
-		( void )tid;
+		bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
 
 		func
 		(
@@ -158,23 +118,16 @@ err_t bli_l3_sup_thread_decorator
 		  c,
 		  cntx,
 		  rntm_p,
-		  cntl_use,
 		  thread
 		);
 
-#if 0
-		// Free the thread's local control tree.
-		//bli_l3_cntl_free( rntm_p, cntl_use, thread );
-		bli_gemm_cntl_free( rntm_p, cntl_use, thread );
-
 		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-#endif
+		bli_l3_sup_thrinfo_free( rntm_p, thread );
 	}
 
 	// We shouldn't free the global communicator since it was already freed
 	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
+	// (called from the thread entry function).
 
 	// Check the array_t back into the small block allocator. Similar to the
 	// check-out, this is done using a lock embedded within the sba to ensure
@@ -182,8 +135,6 @@ err_t bli_l3_sup_thread_decorator
 	bli_sba_checkin_array( array );
 
 	return BLIS_SUCCESS;
-
-#endif
 }
 
 #endif
diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c
index 7cbf65381..5b23bc86b 100644
--- a/frame/thread/bli_l3_sup_decor_pthreads.c
+++ b/frame/thread/bli_l3_sup_decor_pthreads.c
@@ -37,12 +37,82 @@
 
 #ifdef BLIS_ENABLE_PTHREADS
 
+// A data structure to assist in passing operands to additional threads.
+typedef struct thread_data
+{
+	l3supint_t func;
+	opid_t     family;
+	obj_t*     alpha;
+	obj_t*     a;
+	obj_t*     b;
+	obj_t*     beta;
+	obj_t*     c;
+	cntx_t*    cntx;
+	rntm_t*    rntm;
+	dim_t      tid;
+	thrcomm_t* gl_comm;
+	array_t*   array;
+} thread_data_t;
+
+// Entry point for additional threads
+void* bli_l3_sup_thread_entry( void* data_void )
+{
+	thread_data_t* data     = data_void;
+
+	l3supint_t     func     = data->func;
+	opid_t         family   = data->family;
+	obj_t*         alpha    = data->alpha;
+	obj_t*         a        = data->a;
+	obj_t*         b        = data->b;
+	obj_t*         beta     = data->beta;
+	obj_t*         c        = data->c;
+	cntx_t*        cntx     = data->cntx;
+	rntm_t*        rntm     = data->rntm;
+	dim_t          tid      = data->tid;
+	array_t*       array    = data->array;
+	thrcomm_t*     gl_comm  = data->gl_comm;
+
+	( void )family;
+
+	// Create a thread-local copy of the master thread's rntm_t. This is
+	// necessary since we want each thread to be able to track its own
+	// small block pool_t as it executes down the function stack.
+	rntm_t           rntm_l = *rntm;
+	rntm_t* restrict rntm_p = &rntm_l;
+
+	// Use the thread id to access the appropriate pool_t* within the
+	// array_t, and use it to set the sba_pool field within the rntm_t.
+	// If the pool_t* element within the array_t is NULL, it will first
+	// be allocated/initialized.
+	bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+	thrinfo_t* thread = NULL;
+
+	// Create the root node of the current thread's thrinfo_t structure.
+	bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
+
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm_p,
+	  thread
+	);
+
+	// Free the current thread's thrinfo_t structure.
+	bli_l3_sup_thrinfo_free( rntm_p, thread );
+
+	return NULL;
+}
+
 err_t bli_l3_sup_thread_decorator
      (
        l3supint_t func,
        opid_t     family,
-       //pack_t     schema_a,
-       //pack_t     schema_b,
        obj_t*     alpha,
        obj_t*     a,
        obj_t*     b,
@@ -52,36 +122,8 @@ err_t bli_l3_sup_thread_decorator
        rntm_t*    rntm
      )
 {
-#if 0
-
-	return
-	bli_gemmsup_int
-	(
-	  alpha,
-	  a,
-	  b,
-	  beta,
-	  c,
-	  cntx,
-	  rntm,
-	  0
-	);
-
-#else
-
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	//pack_t schema_a = bli_obj_pack_schema( a );
-	//pack_t schema_b = bli_obj_pack_schema( b );
-	//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
+	// Query the total number of threads from the context.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
 
 	// NOTE: The sba was initialized in bli_init().
 
@@ -92,91 +134,82 @@ err_t bli_l3_sup_thread_decorator
 	array_t* restrict array = bli_sba_checkout_array( n_threads );
 
 	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we can create the global comm below.
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
 	bli_sba_rntm_set_pool( 0, array, rntm );
 
-	// Set the packing block allocator field of the rntm.
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
 	bli_membrk_rntm_set_membrk( rntm );
 
-#if 0
-	// Allcoate a global communicator for the root thrinfo_t structures.
+	// Allocate a global communicator for the root thrinfo_t structures.
 	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
-#endif
 
+	// Allocate an array of pthread objects and auxiliary data structs to pass
+	// to the thread entry functions.
 
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
+
+	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
+	// can spawn all other threads before proceeding with its own computation.
+	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
 	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
+		// Set up thread data for additional threads (beyond thread 0).
+		datas[tid].func     = func;
+		datas[tid].family   = family;
+		datas[tid].alpha    = alpha;
+		datas[tid].a        = a;
+		datas[tid].b        = b;
+		datas[tid].beta     = beta;
+		datas[tid].c        = c;
+		datas[tid].cntx     = cntx;
+		datas[tid].rntm     = rntm;
+		datas[tid].tid      = tid;
+		datas[tid].gl_comm  = gl_comm;
+		datas[tid].array    = array;
 
-		cntl_t*    cntl_use = NULL;
-		//thrinfo_t* thread   = NULL;
-		thrinfo_t* thread   = &BLIS_PACKM_SINGLE_THREADED;
-
-		const dim_t tid = 0;
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
-		// need to alias objects for A, B, and C since they were already aliased
-		// in bli_*_front(). However, we may add aliasing here in the future so
-		// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
-		// consistently providing local aliases, we can then eliminate aliasing
-		// elsewhere.
-
-		// Create a default control tree for the operation, if needed.
-		//bli_l3_cntl_create_if( family, schema_a, schema_b,
-		//                       a, b, c, rntm_p, cntl, &cntl_use );
-#if 0
-		cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-#endif
-
-		( void )tid;
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-
-#if 0
-		// Free the thread's local control tree.
-		//bli_l3_cntl_free( rntm_p, cntl_use, thread );
-		bli_gemm_cntl_free( rntm_p, cntl_use, thread );
-
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-#endif
+		// Spawn additional threads for ids greater than 1.
+		if ( tid != 0 )
+			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] );
+		else
+			bli_l3_sup_thread_entry( ( void* )(&datas[0]) );
 	}
 
 	// We shouldn't free the global communicator since it was already freed
 	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
+	// (called from the thread entry function).
+
+	// Thread 0 waits for additional threads to finish.
+	for ( dim_t tid = 1; tid < n_threads; tid++ )
+	{
+		bli_pthread_join( pthreads[tid], NULL );
+	}
 
 	// Check the array_t back into the small block allocator. Similar to the
 	// check-out, this is done using a lock embedded within the sba to ensure
 	// mutual exclusion.
 	bli_sba_checkin_array( array );
 
-	return BLIS_SUCCESS;
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( pthreads );
 
-#endif
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( datas );
+
+	return BLIS_SUCCESS;
 }
 
 #endif
diff --git a/test/supmt/Makefile b/test/supmt/Makefile
new file mode 100644
index 000000000..2ed93565d
--- /dev/null
+++ b/test/supmt/Makefile
@@ -0,0 +1,580 @@
+#!/bin/bash
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2019, Advanced Micro Devices, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# Makefile
+#
+# Field G. Van Zee
+#
+# Makefile for standalone BLIS test drivers.
+#
+
+#
+# --- Makefile PHONY target definitions ----------------------------------------
+#
+
+.PHONY: all all-st all-mt \
+        blis blis-st blis-mt \
+        clean cleanx
+
+
+
+#
+# --- Determine makefile fragment location -------------------------------------
+#
+
+# Comments:
+# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
+# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
+#   the second case because CONFIG_NAME is not yet set.
+ifneq ($(strip $(BLIS_INSTALL_PATH)),)
+LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
+INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
+SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
+else
+DIST_PATH  := ../..
+LIB_PATH    = ../../lib/$(CONFIG_NAME)
+INC_PATH    = ../../include/$(CONFIG_NAME)
+SHARE_PATH := ../..
+endif
+
+
+
+#
+# --- Include common makefile definitions --------------------------------------
+#
+
+# Include the common makefile fragment.
+-include $(SHARE_PATH)/common.mk
+
+
+
+#
+# --- BLAS and LAPACK implementations ------------------------------------------
+#
+
+# BLIS library and header path. This is simply wherever it was installed.
+#BLIS_LIB_PATH  := $(INSTALL_PREFIX)/lib
+#BLIS_INC_PATH  := $(INSTALL_PREFIX)/include/blis
+
+# BLIS library.
+#BLIS_LIB       := $(BLIS_LIB_PATH)/libblis.a
+
+# BLAS library path(s). This is where the BLAS libraries reside.
+HOME_LIB_PATH  := $(HOME)/flame/lib
+MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
+
+# netlib BLAS
+NETLIB_LIB     := $(HOME_LIB_PATH)/libblas.a
+
+# OpenBLAS
+OPENBLAS_LIB   := $(HOME_LIB_PATH)/libopenblas.a
+OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
+
+# BLASFEO
+BLASFEO_LIB    := $(HOME_LIB_PATH)/libblasfeo.a
+
+# libxsmm
+LIBXSMM_LIB    := $(HOME_LIB_PATH)/libxsmm.a -ldl \
+                  $(NETLIB_LIB) -lgfortran
+
+# ATLAS
+ATLAS_LIB      := $(HOME_LIB_PATH)/libf77blas.a \
+                  $(HOME_LIB_PATH)/libatlas.a
+
+# Eigen
+EIGEN_INC      := $(HOME)/flame/eigen/include/eigen3
+EIGEN_LIB      := $(HOME_LIB_PATH)/libeigen_blas_static.a
+EIGENP_LIB     := $(EIGEN_LIB)
+
+# MKL
+MKL_LIB        := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_sequential \
+                  -lpthread -lm -ldl
+MKLP_LIB       := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_gnu_thread \
+                  -lpthread -lm -ldl -fopenmp
+                  #-L$(ICC_LIB_PATH) \
+                  #-lgomp
+
+VENDOR_LIB     := $(MKL_LIB)
+VENDORP_LIB    := $(MKLP_LIB)
+
+
+#
+# --- Problem size definitions -------------------------------------------------
+#
+
+# Single core
+PS_BEGIN := 4
+PS_MAX   := 800
+PS_INC   := 4
+
+# Multicore
+P1_BEGIN := 8 
+P1_MAX   := 1600
+P1_INC   := 8 
+
+
+#
+# --- General build definitions ------------------------------------------------
+#
+
+TEST_SRC_PATH  := .
+TEST_OBJ_PATH  := .
+
+# Gather all local object files.
+TEST_OBJS      := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
+                                    $(TEST_OBJ_PATH)/%.o, \
+                                    $(wildcard $(TEST_SRC_PATH)/*.c)))
+
+# Override the value of CINCFLAGS so that the value of CFLAGS returned by
+# get-frame-cflags-for() is not cluttered up with include paths needed only
+# while building BLIS.
+CINCFLAGS      := -I$(INC_PATH)
+
+# Use the "framework" CFLAGS for the configuration family.
+CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
+
+# Add local header paths to CFLAGS.
+CFLAGS         += -I$(TEST_SRC_PATH)
+
+# Locate the libblis library to which we will link.
+LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
+
+# Define a set of CFLAGS for use with C++ and Eigen.
+CXXFLAGS       := $(subst -std=c99,-std=c++11,$(CFLAGS))
+CXXFLAGS       += -I$(EIGEN_INC)
+
+# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
+CXXFLAGS_ST    := -march=native $(subst -fopenmp,,$(CXXFLAGS))
+CXXFLAGS_MT    := -march=native $(CXXFLAGS)
+
+# Single or multithreaded string
+STR_ST   := -DTHR_STR=\"st\"
+STR_MT   := -DTHR_STR=\"mt\"
+
+# Number of trials per problem size.
+N_TRIALS := -DN_TRIALS=3
+
+# Problem size specification
+PDEF_ST  := -DP_BEGIN=$(PS_BEGIN) \
+            -DP_MAX=$(PS_MAX) \
+            -DP_INC=$(PS_INC)
+
+PDEF_MT  := -DP_BEGIN=$(P1_BEGIN) \
+            -DP_MAX=$(P1_MAX) \
+            -DP_INC=$(P1_INC)
+
+ifeq ($(E),1)
+ERRCHK := -DERROR_CHECK
+else
+ERRCHK := -DNO_ERROR_CHECK
+endif
+
+# Enumerate possible datatypes and computation precisions.
+#dts := s d c z
+DTS := d
+
+TRANS := n_n \
+         n_t \
+         t_n \
+         t_t
+
+# While BLIS supports all combinations of row and column storage for matrices
+# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently
+# support only "all row-storage" or "all column-storage". Thus, we disable the
+# building of those other drivers so that compilation/linking completes sooner.
+#STORS := r_r_r \
+#         r_r_c \
+#         r_c_r \
+#         r_c_c \
+#         c_r_r \
+#         c_r_c \
+#         c_c_r \
+#         c_c_c
+STORS := r_r_r \
+         c_c_c
+
+
+SHAPES := l_l_s \
+          l_s_l \
+          s_l_l \
+          s_s_l \
+          s_l_s \
+          l_s_s \
+          l_l_l
+
+SMS := 6
+SNS := 8
+SKS := 10
+
+
+#
+# --- Function definitions -----------------------------------------------------
+#
+
+# A function to strip the underscores from a list of strings.
+stripu = $(subst _,,$(1))
+
+# Various functions that help us construct the datatype combinations and then
+# extract the needed datatype strings and C preprocessor define flags.
+get-1of2 = $(word 1,$(subst _, ,$(1)))
+get-2of2 = $(word 2,$(subst _, ,$(1)))
+
+get-1of3 = $(word 1,$(subst _, ,$(1)))
+get-2of3 = $(word 2,$(subst _, ,$(1)))
+get-3of3 = $(word 3,$(subst _, ,$(1)))
+
+# Datatype defs.
+get-dt-cpp = $(strip \
+             $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT    -DIS_FLOAT,\
+             $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE   -DIS_DOUBLE,\
+             $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
+                                       -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
+
+# Transpose defs.
+get-tra-defs-a = $(strip $(subst n,-DTRANSA=BLIS_NO_TRANSPOSE -DA_NOTRANS, \
+                         $(subst t,-DTRANSA=BLIS_TRANSPOSE    -DA_TRANS,$(call get-1of2,$(1)))))
+get-tra-defs-b = $(strip $(subst n,-DTRANSB=BLIS_NO_TRANSPOSE -DB_NOTRANS, \
+                         $(subst t,-DTRANSB=BLIS_TRANSPOSE    -DB_TRANS,$(call get-2of2,$(1)))))
+get-tra-defs = $(call get-tra-defs-a,$(1)) $(call get-tra-defs-b,$(1))
+
+# Storage defs.
+get-sto-uch-a = $(strip $(subst r,R, \
+                        $(subst c,C,$(call get-1of3,$(1)))))
+get-sto-uch-b = $(strip $(subst r,R, \
+                        $(subst c,C,$(call get-2of3,$(1)))))
+get-sto-uch-c = $(strip $(subst r,R, \
+                        $(subst c,C,$(call get-3of3,$(1)))))
+get-sto-defs = $(strip \
+                 -DSTOR3=BLIS_$(call get-sto-uch-a,$(1))$(call get-sto-uch-b,$(1))$(call get-sto-uch-c,$(1)) \
+                 -DA_STOR_$(call get-sto-uch-a,$(1)) \
+                 -DB_STOR_$(call get-sto-uch-b,$(1)) \
+                 -DC_STOR_$(call get-sto-uch-c,$(1)))
+
+# Dimension defs.
+get-shape-defs-cm = $(if $(findstring l,$(1)),-DM_DIM=-1,-DM_DIM=$(2))
+get-shape-defs-cn = $(if $(findstring l,$(1)),-DN_DIM=-1,-DN_DIM=$(2))
+get-shape-defs-ck = $(if $(findstring l,$(1)),-DK_DIM=-1,-DK_DIM=$(2))
+get-shape-defs-m  = $(call get-shape-defs-cm,$(call get-1of3,$(1)),$(2))
+get-shape-defs-n  = $(call get-shape-defs-cn,$(call get-2of3,$(1)),$(2))
+get-shape-defs-k  = $(call get-shape-defs-ck,$(call get-3of3,$(1)),$(2))
+
+# arguments: 1: shape (w/ underscores)  2: smallm  3: smalln  4: smallk
+get-shape-defs    = $(strip $(call get-shape-defs-m,$(1),$(2)) \
+                            $(call get-shape-defs-n,$(1),$(3)) \
+                            $(call get-shape-defs-k,$(1),$(4)))
+
+#$(error l_l_s 6 8 4 = $(call get-shape-defs,l_l_s,6,8,4))
+
+# Shape-dimension string.
+get-shape-str-ch  = $(if $(findstring l,$(1)),p,$(2))
+get-shape-str-m   = $(call get-shape-str-ch,$(call get-1of3,$(1)),$(2))
+get-shape-str-n   = $(call get-shape-str-ch,$(call get-2of3,$(1)),$(2))
+get-shape-str-k   = $(call get-shape-str-ch,$(call get-3of3,$(1)),$(2))
+
+# arguments: 1: shape (w/ underscores)  2: smallm  3: smalln  4: smallk
+get-shape-dim-str = m$(call get-shape-str-m,$(1),$(2))n$(call get-shape-str-n,$(1),$(3))k$(call get-shape-str-k,$(1),$(4))
+
+# Implementation defs.
+# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags.
+get-imp-defs = $(strip $(subst  blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \
+                       $(subst blislpab,-DSTR=\"$(1)\" -DBLIS, \
+                       $(subst    eigen,-DSTR=\"$(1)\" -DEIGEN, \
+                       $(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \
+                       $(subst  blasfeo,-DSTR=\"$(1)\" -DCBLAS, \
+                       $(subst  libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \
+                       $(subst   vendor,-DSTR=\"$(1)\" -DCBLAS,$(1)))))))))
+
+TRANS0  = $(call stripu,$(TRANS))
+STORS0  = $(call stripu,$(STORS))
+
+# Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices.
+# Also, limit libxsmm to using all column-stored matrices since it does not offer
+# CBLAS interfaces.
+BSTORS0 = rrr ccc
+ESTORS0 = rrr ccc
+XSTORS0 = ccc
+
+
+#
+# --- Object and binary file definitons ----------------------------------------
+#
+
+get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o)))))))
+
+# Build a list of object files and binaries for each single-threaded
+# implementation using the get-st-objs() function defined above.
+BLISSUP_ST_OBJS  := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
+BLISSUP_ST_BINS  := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS))
+
+BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
+BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS))
+
+EIGEN_ST_OBJS    := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
+EIGEN_ST_BINS    := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
+
+OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
+OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
+
+BLASFEO_ST_OBJS  := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo)
+BLASFEO_ST_BINS  := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS))
+
+LIBXSMM_ST_OBJS  := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm)
+LIBXSMM_ST_BINS  := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS))
+
+VENDOR_ST_OBJS   := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
+VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
+
+# Mark the object files as intermediate so that make will remove them
+# automatically after building the binaries on which they depend.
+.INTERMEDIATE: $(BLISSUP_ST_OBJS) \
+               $(BLISLPAB_ST_OBJS) \
+               $(EIGEN_ST_OBJS) \
+               $(OPENBLAS_ST_OBJS) \
+               $(BLASFEO_ST_OBJS) \
+               $(LIBXSMM_ST_OBJS) \
+               $(VENDOR_ST_OBJS)
+
+get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o)))))))
+
+# Build a list of object files and binaries for each multithreaded
+# implementation using the get-st-objs() function defined above.
+BLISSUP_MT_OBJS  := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
+BLISSUP_MT_BINS  := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS))
+
+BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
+BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS))
+
+EIGEN_MT_OBJS    := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
+EIGEN_MT_BINS    := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS))
+
+OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
+OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS))
+
+VENDOR_MT_OBJS   := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
+VENDOR_MT_BINS   := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS))
+
+#$(error "objs = $(EIGEN_ST_BINS)" )
+
+# Mark the object files as intermediate so that make will remove them
+# automatically after building the binaries on which they depend.
+.INTERMEDIATE: $(BLISSUP_MT_OBJS) \
+               $(BLISLPAB_MT_OBJS) \
+               $(EIGEN_MT_OBJS) \
+               $(OPENBLAS_MT_OBJS) \
+               $(VENDOR_MT_OBJS)
+
+
+#
+# --- Targets/rules ------------------------------------------------------------
+#
+
+all:         st
+
+blis:        blissup-st blislpab-st
+
+blissup:     blissup-st
+blislpab:    blislpab-st
+eigen:       eigen-st
+openblas:    openblas-st
+blasfeo:     blasfeo-st
+libxsmm:     libxsmm-st
+vendor:      vendor-st
+
+st:          blissup-st blislpab-st \
+             eigen-st openblas-st blasfeo-st libxsmm-st vendor-st
+
+blissup-st:  $(BLISSUP_ST_BINS)
+blislpab-st: $(BLISLPAB_ST_BINS)
+eigen-st:    $(EIGEN_ST_BINS)
+openblas-st: $(OPENBLAS_ST_BINS)
+blasfeo-st:  $(BLASFEO_ST_BINS)
+libxsmm-st:  $(LIBXSMM_ST_BINS)
+vendor-st:   $(VENDOR_ST_BINS)
+
+mt:          blissup-mt blislpab-mt \
+             eigen-mt openblas-mt vendor-mt
+
+blissup-mt:  $(BLISSUP_MT_BINS)
+blislpab-mt: $(BLISLPAB_MT_BINS)
+eigen-mt:    $(EIGEN_MT_BINS)
+openblas-mt: $(OPENBLAS_MT_BINS)
+vendor-mt:   $(VENDOR_MT_BINS)
+
+
+
+# --Object file rules --
+
+# Define the implementations for which we will instantiate compilation rules.
+BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor
+BIMPLS_MT := blissup blislpab openblas                 vendor
+EIMPLS    := eigen
+
+#      1     2  3   4 567  8
+# test_dgemm_nn_rrr_mpn6kp_blissup_st.x
+
+# Define the function that will be used to instantiate compilation rules
+# for the various single-threaded implementations.
+define make-st-rule
+test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
+	$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
+endef
+
+# Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS
+# implementation.
+$(foreach dt,$(DTS), \
+$(foreach tr,$(TRANS), \
+$(foreach st,$(STORS), \
+$(foreach sh,$(SHAPES), \
+$(foreach sm,$(SMS), \
+$(foreach sn,$(SNS), \
+$(foreach sk,$(SKS), \
+$(foreach impl,$(BIMPLS_ST), \
+$(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
+
+# Define the function that will be used to instantiate compilation rules
+# for the various multithreaded implementations.
+define make-mt-rule
+test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
+	$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
+endef
+
+# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS
+# implementation.
+$(foreach dt,$(DTS), \
+$(foreach tr,$(TRANS), \
+$(foreach st,$(STORS), \
+$(foreach sh,$(SHAPES), \
+$(foreach sm,$(SMS), \
+$(foreach sn,$(SNS), \
+$(foreach sk,$(SKS), \
+$(foreach impl,$(BIMPLS_MT), \
+$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
+
+# Define the function that will be used to instantiate compilation rules
+# for the single-threaded Eigen implementation.
+define make-eigst-rule
+test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
+	$(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
+endef
+
+# Instantiate the rule function make-st-rule() for each Eigen implementation.
+$(foreach dt,$(DTS), \
+$(foreach tr,$(TRANS), \
+$(foreach st,$(STORS), \
+$(foreach sh,$(SHAPES), \
+$(foreach sm,$(SMS), \
+$(foreach sn,$(SNS), \
+$(foreach sk,$(SKS), \
+$(foreach impl,$(EIMPLS), \
+$(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
+
+# Define the function that will be used to instantiate compilation rules
+# for the multithreaded Eigen implementation.
+define make-eigmt-rule
+test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
+endef
+
+# Instantiate the rule function make-st-rule() for each Eigen implementation.
+$(foreach dt,$(DTS), \
+$(foreach tr,$(TRANS), \
+$(foreach st,$(STORS), \
+$(foreach sh,$(SHAPES), \
+$(foreach sm,$(SMS), \
+$(foreach sn,$(SNS), \
+$(foreach sk,$(SKS), \
+$(foreach impl,$(EIMPLS), \
+$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
+
+
+# -- Executable file rules --
+
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+
+test_%_blissup_st.x:  test_%_blissup_st.o  $(LIBBLIS_LINK)
+	$(CC) $(strip $<                       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_blislpab_st.x: test_%_blislpab_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_eigen_st.x:    test_%_eigen_st.o    $(LIBBLIS_LINK)
+	$(CXX) $(strip $<                      $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(OPENBLAS_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_blasfeo_st.x:  test_%_blasfeo_st.o  $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(BLASFEO_LIB)       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_libxsmm_st.x:  test_%_libxsmm_st.o  $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(LIBXSMM_LIB)       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_vendor_st.x:   test_%_vendor_st.o   $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(VENDOR_LIB)        $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_blissup_mt.x:  test_%_blissup_mt.o  $(LIBBLIS_LINK)
+	$(CC) $(strip $<                       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_blislpab_mt.x: test_%_blislpab_mt.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_eigen_mt.x:    test_%_eigen_mt.o    $(LIBBLIS_LINK)
+	$(CXX) $(strip $<                      $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(OPENBLASP_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_vendor_mt.x:   test_%_vendor_mt.o   $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(VENDORP_LIB)       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+# -- Clean rules --
+
+clean: cleanx
+
+cleanx:
+	- $(RM_F) *.x *.o
+
diff --git a/test/supmt/octave/gen_opsupnames.m b/test/supmt/octave/gen_opsupnames.m
new file mode 100644
index 000000000..2debd4142
--- /dev/null
+++ b/test/supmt/octave/gen_opsupnames.m
@@ -0,0 +1,52 @@
+function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims )
+
+nops = size( ops, 1 );
+
+smallm = smalldims( 1 );
+smalln = smalldims( 2 );
+smallk = smalldims( 3 );
+
+i = 1;
+
+for io = 1:nops
+
+	op = ops( io, : );
+
+	str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm );
+	str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln );
+	str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk );
+	str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk );
+	str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk );
+	str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln );
+	str6 = sprintf( '%s_%s_mpnpkp', op, stor );
+
+	%opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm )
+	%opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln )
+	%opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk )
+	%opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk )
+	%opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk )
+	%opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln )
+	%opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor )
+
+	opsupnames( i+0, : ) = sprintf( '%-20s', str0 );
+	opsupnames( i+1, : ) = sprintf( '%-20s', str1 );
+	opsupnames( i+2, : ) = sprintf( '%-20s', str2 );
+	opsupnames( i+3, : ) = sprintf( '%-20s', str3 );
+	opsupnames( i+4, : ) = sprintf( '%-20s', str4 );
+	opsupnames( i+5, : ) = sprintf( '%-20s', str5 );
+	opsupnames( i+6, : ) = sprintf( '%-20s', str6 );
+
+	opnames( i+0, : ) = sprintf( '%s', op );
+	opnames( i+1, : ) = sprintf( '%s', op );
+	opnames( i+2, : ) = sprintf( '%s', op );
+	opnames( i+3, : ) = sprintf( '%s', op );
+	opnames( i+4, : ) = sprintf( '%s', op );
+	opnames( i+5, : ) = sprintf( '%s', op );
+	opnames( i+6, : ) = sprintf( '%s', op );
+
+	i = i + 7;
+end
+
+r_val1 = opsupnames;
+r_val2 = opnames;
+
diff --git a/test/supmt/octave/plot_l3sup_perf.m b/test/supmt/octave/plot_l3sup_perf.m
new file mode 100644
index 000000000..28056e25a
--- /dev/null
+++ b/test/supmt/octave/plot_l3sup_perf.m
@@ -0,0 +1,250 @@
+function r_val = plot_l3sup_perf( opname, ...
+                                  data_blissup, ...
+                                  data_blislpab, ...
+                                  data_eigen, ...
+                                  data_open, ...
+                                  data_vend, vend_str, ...
+                                  nth, ...
+                                  rows, cols, ...
+                                  cfreq, ...
+                                  dfps, ...
+                                  theid, impl )
+%if ... %mod(theid-1,cols) == 2 || ...
+%   ... %mod(theid-1,cols) == 3 || ...
+%   ... %mod(theid-1,cols) == 4 || ...
+%   0 == 1 ... %theid >= 19
+%	show_plot = 0;
+%else
+	show_plot = 1;
+%end
+
+%legend_plot_id = 11;
+legend_plot_id = 1*cols + 1*5;
+
+if 1
+ax1 = subplot( rows, cols, theid );
+hold( ax1, 'on' );
+end
+
+% Set line properties.
+color_blissup  = 'k'; lines_blissup  = '-';  markr_blissup  = '';
+color_blislpab = 'k'; lines_blislpab = ':';  markr_blislpab = '';
+color_eigen    = 'm'; lines_eigen    = '-.'; markr_eigen    = 'o';
+color_open     = 'r'; lines_open     = '--'; markr_open     = 'o';
+color_vend     = 'b'; lines_vend     = '-.'; markr_vend     = '.';
+
+% Compute the peak performance in terms of the number of double flops
+% executable per cycle and the clock rate.
+if opname(1) == 's' || opname(1) == 'c'
+	flopspercycle = dfps * 2;
+else
+	flopspercycle = dfps;
+end
+max_perf_core = (flopspercycle * cfreq) * 1;
+
+% Escape underscores in the title.
+title_opname = strrep( opname, '_', '\_' );
+
+% Print the title to a string.
+titlename = '%s';
+titlename = sprintf( titlename, title_opname );
+
+% Set the legend strings.
+blissup_legend  = sprintf( 'BLIS sup' );
+blislpab_legend = sprintf( 'BLIS conv' );
+eigen_legend    = sprintf( 'Eigen' );
+open_legend     = sprintf( 'OpenBLAS' );
+%vend_legend     = sprintf( 'MKL' );
+%vend_legend     = sprintf( 'ARMPL' );
+vend_legend     = vend_str;
+
+% Set axes range values.
+y_scale = 1.00;
+x_begin = 0;
+%x_end is set below.
+y_begin = 0;
+y_end   = max_perf_core * y_scale;
+
+% Set axes names.
+if nth == 1
+	yaxisname = 'GFLOPS';
+else
+	yaxisname = 'GFLOPS/core';
+end
+
+
+%flopscol = 4;
+flopscol = size( data_blissup, 2 );
+msize = 5;
+if 1
+fontsize = 11;
+else
+fontsize = 16;
+end
+linesize = 0.5;
+legend_loc = 'southeast';
+
+% --------------------------------------------------------------------
+
+% Automatically detect a column with the increasing problem size.
+% Then set the maximum x-axis value.
+for psize_col = 1:3
+	if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col )
+		break;
+	end
+end
+x_axis( :, 1 ) = data_blissup( :, psize_col );
+
+% Compute the number of data points we have in the x-axis. Note that
+% we only use quarter the data points for the m = n = k column of graphs.
+if mod(theid-1,cols) == 6
+	np = size( data_blissup, 1 ) / 4;
+else
+	np = size( data_blissup, 1 );
+end
+
+% Grab the last x-axis value.
+x_end = data_blissup( np, psize_col );
+
+%data_peak( 1, 1:2 ) = [     0 max_perf_core ];
+%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
+
+if show_plot == 1
+blissup_ln  = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
+                    'Color',color_blissup, 'LineStyle',lines_blissup, ...
+                    'LineWidth',linesize );
+blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
+                    'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
+                    'LineWidth',linesize );
+eigen_ln    = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
+                    'Color',color_eigen, 'LineStyle',lines_eigen, ...
+                    'LineWidth',linesize );
+open_ln     = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
+                    'Color',color_open, 'LineStyle',lines_open, ...
+                    'LineWidth',linesize );
+vend_ln     = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
+                    'Color',color_vend, 'LineStyle',lines_vend, ...
+                    'LineWidth',linesize );
+else
+if theid == legend_plot_id
+blissup_ln  = line( nan, nan, ...
+                    'Color',color_blissup, 'LineStyle',lines_blissup, ...
+                    'LineWidth',linesize );
+blislpab_ln = line( nan, nan, ...
+                    'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
+                    'LineWidth',linesize );
+eigen_ln    = line( nan, nan, ...
+                    'Color',color_eigen, 'LineStyle',lines_eigen, ...
+                    'LineWidth',linesize );
+open_ln     = line( nan, nan, ...
+                    'Color',color_open, 'LineStyle',lines_open, ...
+                    'LineWidth',linesize );
+vend_ln     = line( nan, nan, ...
+                    'Color',color_vend, 'LineStyle',lines_vend, ...
+                    'LineWidth',linesize );
+end
+end
+
+
+xlim( ax1, [x_begin x_end] );
+ylim( ax1, [y_begin y_end] );
+
+if     6000 <= x_end && x_end < 10000
+	x_tick2 = x_end - 2000;
+	x_tick1 = x_tick2/2;
+	xticks( ax1, [ x_tick1 x_tick2 ] );
+elseif 4000 <= x_end && x_end < 6000
+	x_tick2 = x_end - 1000;
+	x_tick1 = x_tick2/2;
+	xticks( ax1, [ x_tick1 x_tick2 ] );
+elseif 2000 <= x_end && x_end < 3000
+	x_tick2 = x_end - 400;
+	x_tick1 = x_tick2/2;
+	xticks( ax1, [ x_tick1 x_tick2 ] );
+elseif 500 <= x_end && x_end < 1000
+	x_tick3 = x_end*(3/4);
+	x_tick2 = x_end*(2/4);
+	x_tick1 = x_end*(1/4);
+	xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
+end
+
+if show_plot == 1 || theid == legend_plot_id
+	if theid == legend_plot_id
+		leg = legend( ...
+		[ ...
+		  blissup_ln ...
+		  blislpab_ln ...
+		  eigen_ln ...
+		  open_ln ...
+		  vend_ln ...
+		], ...
+		blissup_legend, ...
+		blislpab_legend, ...
+		eigen_legend, ...
+		open_legend, ...
+		vend_legend, ...
+		'Location', legend_loc );
+		set( leg,'Box','off' );
+		set( leg,'Color','none' );
+		set( leg,'Units','inches' );
+		if impl == 'octave'
+		set( leg,'FontSize',fontsize );
+		set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl)
+		else
+		set( leg,'FontSize',fontsize-1 );
+		set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
+		end
+		set( leg,'Box','off' );
+		set( leg,'Color','none' );
+		set( leg,'Units','inches' );
+		%                    xpos ypos
+		%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
+	end
+end
+
+set( ax1,'FontSize',fontsize );
+set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
+box( ax1, 'on' );
+
+titl = title( titlename );
+set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
+
+if impl == 'octave'
+tpos = get( titl, 'Position' ); % default is to align across whole figure, not box.
+tpos(1) = tpos(1) + -40;
+set( titl, 'Position', tpos ); % here we nudge it back to centered with box.
+end
+
+if theid > (rows-1)*cols
+%xlab = xlabel( ax1,xaxisname );
+%tpos = get( xlab, 'Position' )
+%tpos(2) = tpos(2) + 10;
+%set( xlab, 'Position', tpos );
+	if     theid == rows*cols - 6
+	xlab = xlabel( ax1, 'm = 6; n = k' );
+	elseif theid == rows*cols - 5
+	xlab = xlabel( ax1, 'n = 8; m = k' );
+	elseif theid == rows*cols - 4
+	xlab = xlabel( ax1, 'k = 10; m = n' );
+	elseif theid == rows*cols - 3
+	xlab = xlabel( ax1, 'm; n = 8, k = 10' );
+	elseif theid == rows*cols - 2
+	xlab = xlabel( ax1, 'n; m = 6, k = 10' );
+	elseif theid == rows*cols - 1
+	xlab = xlabel( ax1, 'k; m = 6, n = 8' );
+	elseif theid == rows*cols - 0
+	xlab = xlabel( ax1, 'm = n = k' );
+	end
+end
+
+if mod(theid-1,cols) == 0
+ylab = ylabel( ax1,yaxisname );
+end
+
+%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' );
+%saveas( fig, filename_png );
+
+%hold( ax1, 'off' );
+
+r_val = 0;
+
diff --git a/test/supmt/octave/plot_panel_trxsh.m b/test/supmt/octave/plot_panel_trxsh.m
new file mode 100644
index 000000000..cf5e860a5
--- /dev/null
+++ b/test/supmt/octave/plot_panel_trxsh.m
@@ -0,0 +1,152 @@
+function r_val = plot_panel_trxsh ...
+     ( ...
+       cfreq, ...
+       dflopspercycle, ...
+       nth, ...
+       thr_str, ...
+       dt_ch, ...
+       stor_str, ...
+       smalldims, ...
+       dirpath, ...
+       arch_str, ...
+       vend_str, ...
+       impl ...
+     )
+
+%cfreq = 1.8;
+%dflopspercycle = 32;
+
+% Create filename "templates" for the files that contain the performance
+% results.
+filetemp_blissup  = '%s/output_%s_%s_blissup.m';
+filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
+filetemp_eigen    = '%s/output_%s_%s_eigen.m';
+filetemp_open     = '%s/output_%s_%s_openblas.m';
+filetemp_vend     = '%s/output_%s_%s_vendor.m';
+
+% Create a variable name "template" for the variables contained in the
+% files outlined above.
+vartemp = 'data_%s_%s_%s( :, : )';
+
+% Define the datatypes and operations we will be plotting.
+oproot      = sprintf( '%cgemm', dt_ch );
+ops( 1, : ) = sprintf( '%s_nn', oproot );
+ops( 2, : ) = sprintf( '%s_nt', oproot );
+ops( 3, : ) = sprintf( '%s_tn', oproot );
+ops( 4, : ) = sprintf( '%s_tt', oproot );
+
+% Generate datatype-specific operation names from the set of operations
+% and datatypes.
+[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims );
+n_opsupnames = size( opsupnames, 1 );
+
+%opsupnames
+%opnames
+%return
+
+if 1 == 1
+	%fig = figure('Position', [100, 100, 2400, 1500]);
+	fig = figure('Position', [100, 100, 2400, 1200]);
+	orient( fig, 'portrait' );
+	set(gcf,'PaperUnits', 'inches');
+	if impl == 'matlab'
+		set(gcf,'PaperSize', [11.5 20.4]);
+		set(gcf,'PaperPosition', [0 0 11.5 20.4]);
+		set(gcf,'PaperPositionMode','manual');
+	else % impl == 'octave' % octave 4.x
+	   set(gcf,'PaperSize', [12 21.5]);
+	   set(gcf,'PaperPositionMode','auto');
+	end
+	set(gcf,'PaperOrientation','landscape');
+end
+
+
+% Iterate over the list of datatype-specific operation names.
+for opi = 1:n_opsupnames
+%for opi = 1:1
+
+	% Grab the current datatype combination.
+	opsupname = opsupnames( opi, : );
+	opname    = opnames( opi, : );
+
+	opsupname = strtrim( opsupname );
+	opname    = strtrim( opname );
+
+	str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str);
+
+	% Construct filenames for the data files from templates.
+	file_blissup  = sprintf( filetemp_blissup,  dirpath, thr_str, opsupname );
+	file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname );
+	file_eigen    = sprintf( filetemp_eigen,    dirpath, thr_str, opsupname );
+	file_open     = sprintf( filetemp_open,     dirpath, thr_str, opsupname );
+	file_vend     = sprintf( filetemp_vend,     dirpath, thr_str, opsupname );
+
+	% Load the data files.
+	%str = sprintf( '  Loading %s', file_blissup ); disp(str);
+	run( file_blissup )
+	run( file_blislpab )
+	run( file_eigen )
+	run( file_open )
+	run( file_vend )
+
+	% Construct variable names for the variables in the data files.
+	var_blissup  = sprintf( vartemp, thr_str, opname, 'blissup' );
+	var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' );
+	var_eigen    = sprintf( vartemp, thr_str, opname, 'eigen' );
+	var_open     = sprintf( vartemp, thr_str, opname, 'openblas' );
+	var_vend     = sprintf( vartemp, thr_str, opname, 'vendor' );
+
+	% Use eval() to instantiate the variable names constructed above,
+	% copying each to a simplified name.
+	data_blissup  = eval( var_blissup );  % e.g. data_st_dgemm_blissup( :, : );
+	data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : );
+	data_eigen = eval( var_eigen );       % e.g. data_st_dgemm_eigen( :, : );
+	data_open = eval( var_open );         % e.g. data_st_dgemm_openblas( :, : );
+	data_vend = eval( var_vend );         % e.g. data_st_dgemm_vendor( :, : );
+
+	%str = sprintf( '  Reading %s', var_blissup ); disp(str);
+	%str = sprintf( '  Reading %s', var_blislpab ); disp(str);
+	%str = sprintf( '  Reading %s', var_eigen ); disp(str);
+	%str = sprintf( '  Reading %s', var_open ); disp(str);
+	%str = sprintf( '  Reading %s', var_bfeo ); disp(str);
+	%str = sprintf( '  Reading %s', var_xsmm ); disp(str);
+	%str = sprintf( '  Reading %s', var_vend ); disp(str);
+
+	% Plot one result in an m x n grid of plots, via the subplot()
+	% function.
+	if 1 == 1
+	plot_l3sup_perf( opsupname, ...
+	                 data_blissup, ...
+	                 data_blislpab, ...
+	                 data_eigen, ...
+	                 data_open, ...
+	                 data_vend, vend_str, ...
+	                 nth, ...
+	                 4, 7, ...
+	                 cfreq, ...
+	                 dflopspercycle, ...
+	                 opi, impl );
+
+	clear data_mt_*gemm_*;
+	clear data_blissup;
+	clear data_blislpab;
+	clear data_eigen;
+	clear data_open;
+	clear data_vend;
+
+	end
+
+end
+
+% Construct the name of the file to which we will output the graph.
+outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth );
+
+% Output the graph to pdf format.
+%print(gcf, 'gemm_md','-fillpage','-dpdf');
+%print(gcf, outfile,'-bestfit','-dpdf');
+if impl == 'octave'
+print(gcf, outfile);
+else % if impl == 'matlab'
+print(gcf, outfile,'-bestfit','-dpdf');
+end
+
diff --git a/test/supmt/octave/runthese.m b/test/supmt/octave/runthese.m
new file mode 100644
index 000000000..5946d4796
--- /dev/null
+++ b/test/supmt/octave/runthese.m
@@ -0,0 +1,12 @@
+
+% haswell
+plot_panel_trxsh(3.25,16,1,'mt','d','ccc',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.25,16,1,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
+
+% kabylake
+plot_panel_trxsh(3.80,16,1,'mt','d','rrr',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.80,16,1,'mt','d','ccc',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
+
+% epyc
+plot_panel_trxsh(3.00, 8,1,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.00, 8,1,'mt','d','ccc',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
diff --git a/test/supmt/runme.sh b/test/supmt/runme.sh
new file mode 100755
index 000000000..e878d76b0
--- /dev/null
+++ b/test/supmt/runme.sh
@@ -0,0 +1,188 @@
+#!/bin/bash
+
+# File pefixes.
+exec_root="test"
+out_root="output"
+
+sys="blis"
+#sys="lonestar5"
+#sys="ul252"
+#sys="ul264"
+
+if [ ${sys} = "blis" ]; then
+
+	export GOMP_CPU_AFFINITY="0-3"
+	nt=4
+
+elif [ ${sys} = "lonestar5" ]; then
+
+	export GOMP_CPU_AFFINITY="0-23"
+	nt=24
+
+elif [ ${sys} = "ul252" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0-51"
+	nt=52
+
+elif [ ${sys} = "ul264" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0-63"
+	nt=64
+
+fi
+
+# Delay between test cases.
+delay=0.02
+
+# Threadedness to test.
+threads="mt"
+
+# Datatypes to test.
+#dts="d s"
+dts="d"
+
+# Operations to test.
+ops="gemm"
+
+# Transpose combintions to test.
+trans="nn nt tn tt"
+
+# Storage combinations to test.
+#stors="rrr rrc rcr rcc crr crc ccr ccc"
+stors="rrr ccc"
+
+# Problem shapes to test.
+shapes="sll lsl lls lss sls ssl lll"
+
+# FGVZ: figure out how to probe what's in the directory and
+# execute everything that's there?
+sms="6"
+sns="8"
+sks="10"
+
+# Implementations to test.
+impls="vendor blissup blislpab openblas eigen"
+#impls="vendor"
+#impls="blissup"
+#impls="blislpab"
+#impls="openblas"
+#impls="eigen"
+
+# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
+# restore the value.
+GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
+
+# Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x
+
+for th in ${threads}; do
+
+	for dt in ${dts}; do
+
+		for op in ${ops}; do
+
+			for tr in ${trans}; do
+
+				for st in ${stors}; do
+
+					for sh in ${shapes}; do
+
+						for sm in ${sms}; do
+
+							for sn in ${sns}; do
+
+								for sk in ${sks}; do
+
+									for im in ${impls}; do
+
+										if   [ "${im:0:4}" = "blis" ]; then
+											unset  OMP_NUM_THREADS
+											export BLIS_NUM_THREADS=${nt}
+										elif [ "${im}" = "openblas" ]; then
+											unset  OMP_NUM_THREADS
+											export OPENBLAS_NUM_THREADS=${nt}
+										elif [ "${im}" = "eigen" ]; then
+											export OMP_NUM_THREADS=${nt}
+										elif [ "${im}" = "vendor" ]; then
+											unset  OMP_NUM_THREADS
+											export MKL_NUM_THREADS=${nt}
+										fi
+
+										# Multithreaded OpenBLAS seems to have a problem
+										# running properly if GOMP_CPU_AFFINITY is set.
+										# So we temporarily unset it here if we are about
+										# to execute OpenBLAS, but otherwise restore it.
+										if [ ${im} = "openblas" ]; then
+											unset GOMP_CPU_AFFINITY
+										else
+											export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
+										fi
+
+										# Limit execution of non-BLIS implementations to
+										# rrr/ccc storage cases.
+										if [ "${im:0:4}" != "blis" ] && \
+										   [ "${st}" != "rrr" ] && \
+										   [ "${st}" != "ccc" ]; then
+											continue;
+										fi
+
+										# Further limit execution of libxsmm to
+										# ccc storage cases.
+										if [ "${im:0:7}" = "libxsmm" ] && \
+										   [ "${st}" != "ccc" ]; then
+											continue;
+										fi
+
+										# Extract the shape chars for m, n, k.
+										chm=${sh:0:1}
+										chn=${sh:1:1}
+										chk=${sh:2:1}
+
+										# Construct the shape substring (e.g. m6npkp)
+										shstr=""
+
+										if [ ${chm} = "s" ]; then
+											shstr="${shstr}m${sm}"
+										else
+											shstr="${shstr}mp"
+										fi
+
+										if [ ${chn} = "s" ]; then
+											shstr="${shstr}n${sn}"
+										else
+											shstr="${shstr}np"
+										fi
+
+										if [ ${chk} = "s" ]; then
+											shstr="${shstr}k${sk}"
+										else
+											shstr="${shstr}kp"
+										fi
+
+										# Ex: test_dgemm_nn_rrc_m6npkp_blissup_st.x
+
+										# Construct the name of the test executable.
+										exec_name="${exec_root}_${dt}${op}_${tr}_${st}_${shstr}_${im}_${th}.x"
+
+										# Construct the name of the output file.
+										out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m"
+
+										echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}"
+
+										# Run executable.
+										./${exec_name} > ${out_file}
+
+										sleep ${delay}
+
+									done
+								done
+							done
+						done
+					done
+				done
+			done
+		done
+	done
+done
+
diff --git a/test/supmt/test_gemm.c b/test/supmt/test_gemm.c
new file mode 100644
index 000000000..95e9d45b2
--- /dev/null
+++ b/test/supmt/test_gemm.c
@@ -0,0 +1,589 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#ifdef EIGEN
+  #define BLIS_DISABLE_BLAS_DEFS
+  #include "blis.h"
+  #include <Eigen/Core>
+  //#include <Eigen/src/misc/blas.h>
+  using namespace Eigen;
+#else
+  #include "blis.h"
+#endif
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	rntm_t rntm_g;
+
+	bli_init();
+
+	// Copy the global rntm_t object in case we need it later when disabling
+	// sup.
+	bli_rntm_init_from_global( &rntm_g );
+
+#ifndef ERROR_CHECK
+	bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+#endif
+
+
+	dim_t n_trials = N_TRIALS;
+
+	num_t dt      = DT;
+
+#if 1
+	dim_t p_begin = P_BEGIN;
+	dim_t p_max   = P_MAX;
+	dim_t p_inc   = P_INC;
+#else
+	dim_t p_begin = 4;
+	dim_t p_max   = 40;
+	dim_t p_inc   = 4;
+#endif
+
+#if 1
+	dim_t m_input = M_DIM;
+	dim_t n_input = N_DIM;
+	dim_t k_input = K_DIM;
+#else
+	p_begin = p_inc = 32;
+	dim_t m_input = 6;
+	dim_t n_input = -1;
+	dim_t k_input = -1;
+#endif
+
+#if 1
+	trans_t transa = TRANSA;
+	trans_t transb = TRANSB;
+#else
+	trans_t transa = BLIS_NO_TRANSPOSE;
+	trans_t transb = BLIS_NO_TRANSPOSE;
+#endif
+
+#if 1
+	stor3_t sc = STOR3;
+#else
+	stor3_t sc = BLIS_RRR;
+#endif
+
+
+	inc_t    rs_c, cs_c;
+	inc_t    rs_a, cs_a;
+	inc_t    rs_b, cs_b;
+
+	if      ( sc == BLIS_RRR ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = -1; }
+	else if ( sc == BLIS_RRC ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b =  0; }
+	else if ( sc == BLIS_RCR ) { rs_c = cs_c = -1; rs_a = cs_a =  0; rs_b = cs_b = -1; }
+	else if ( sc == BLIS_RCC ) { rs_c = cs_c = -1; rs_a = cs_a =  0; rs_b = cs_b =  0; }
+	else if ( sc == BLIS_CRR ) { rs_c = cs_c =  0; rs_a = cs_a = -1; rs_b = cs_b = -1; }
+	else if ( sc == BLIS_CRC ) { rs_c = cs_c =  0; rs_a = cs_a = -1; rs_b = cs_b =  0; }
+	else if ( sc == BLIS_CCR ) { rs_c = cs_c =  0; rs_a = cs_a =  0; rs_b = cs_b = -1; }
+	else if ( sc == BLIS_CCC ) { rs_c = cs_c =  0; rs_a = cs_a =  0; rs_b = cs_b =  0; }
+	else                       { bli_abort(); }
+
+	f77_int cbla_storage;
+
+	if      ( sc == BLIS_RRR ) cbla_storage = CblasRowMajor;
+	else if ( sc == BLIS_CCC ) cbla_storage = CblasColMajor;
+	else                       cbla_storage = -1;
+
+	( void )cbla_storage;
+
+
+	char dt_ch;
+
+	// Choose the char corresponding to the requested datatype.
+	if      ( bli_is_float( dt ) )    dt_ch = 's';
+	else if ( bli_is_double( dt ) )   dt_ch = 'd';
+	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
+	else                              dt_ch = 'z';
+
+	f77_char f77_transa;
+	f77_char f77_transb;
+	char     transal, transbl;
+
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+	transal = tolower( f77_transa );
+	transbl = tolower( f77_transb );
+
+	f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
+	f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
+
+	( void )cbla_transa;
+	( void )cbla_transb;
+
+	dim_t p;
+
+	// Begin with initializing the last entry to zero so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
+
+	printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
+	                                  transal, transbl, STR );
+	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
+	        ( unsigned long )0,
+	        ( unsigned long )0,
+	        ( unsigned long )0, 0.0 );
+
+
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
+	{
+		obj_t  a, b, c;
+		obj_t  c_save;
+		obj_t  alpha, beta;
+		dim_t  m, n, k;
+
+		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt, m, n, rs_c, cs_c, &c );
+		bli_obj_create( dt, m, n, rs_c, cs_c, &c_save );
+
+		if ( bli_does_notrans( transa ) )
+			bli_obj_create( dt, m, k, rs_a, cs_a, &a );
+		else
+			bli_obj_create( dt, k, m, rs_a, cs_a, &a );
+
+		if ( bli_does_notrans( transb ) )
+			bli_obj_create( dt, k, n, rs_b, cs_b, &b );
+		else
+			bli_obj_create( dt, n, k, rs_b, cs_b, &b );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (1.0/1.0), 0.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		bli_copym( &c, &c_save );
+
+#ifdef EIGEN
+		double alpha_r, alpha_i;
+
+		bli_getsc( &alpha, &alpha_r, &alpha_i );
+
+		void* ap = bli_obj_buffer_at_off( &a );
+		void* bp = bli_obj_buffer_at_off( &b );
+		void* cp = bli_obj_buffer_at_off( &c );
+
+		const int os_a = ( bli_obj_is_col_stored( &a ) ? bli_obj_col_stride( &a )
+		                                               : bli_obj_row_stride( &a ) );
+		const int os_b = ( bli_obj_is_col_stored( &b ) ? bli_obj_col_stride( &b )
+		                                               : bli_obj_row_stride( &b ) );
+		const int os_c = ( bli_obj_is_col_stored( &c ) ? bli_obj_col_stride( &c )
+		                                               : bli_obj_row_stride( &c ) );
+
+		Stride<Dynamic,1> stride_a( os_a, 1 );
+		Stride<Dynamic,1> stride_b( os_b, 1 );
+		Stride<Dynamic,1> stride_c( os_c, 1 );
+
+		#if defined(IS_FLOAT)
+		#elif defined (IS_DOUBLE)
+			#ifdef A_STOR_R
+			typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_A;
+			#else
+			typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_A;
+			#endif
+			#ifdef B_STOR_R
+			typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_B;
+			#else
+			typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_B;
+			#endif
+			#ifdef C_STOR_R
+			typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_C;
+			#else
+			typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_C;
+			#endif
+
+			#ifdef A_NOTRANS  // A is not transposed
+			Map<MatrixXd_A,  0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
+			#else // A is transposed
+			Map<MatrixXd_A,  0, Stride<Dynamic,1> > A( ( double* )ap, k, m, stride_a );
+			#endif
+
+			#ifdef B_NOTRANS // B is not transposed
+			Map<MatrixXd_B,  0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
+			#else // B is transposed
+			Map<MatrixXd_B,  0, Stride<Dynamic,1> > B( ( double* )bp, n, k, stride_b );
+			#endif
+
+			Map<MatrixXd_C,  0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
+		#endif
+#endif
+
+
+		double dtime_save = DBL_MAX;
+
+		for ( dim_t r = 0; r < n_trials; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			double dtime = bli_clock();
+
+
+#ifdef EIGEN
+
+		#ifdef A_NOTRANS
+			#ifdef B_NOTRANS
+			C.noalias() += alpha_r * A * B;
+			#else // B_TRANS
+			C.noalias() += alpha_r * A * B.transpose();
+			#endif
+		#else // A_TRANS
+			#ifdef B_NOTRANS
+			C.noalias() += alpha_r * A.transpose() * B;
+			#else // B_TRANS
+			C.noalias() += alpha_r * A.transpose() * B.transpose();
+			#endif
+		#endif
+
+#endif
+#ifdef BLIS
+	#ifdef SUP
+			// Allow sup.
+			bli_gemm( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+	#else
+			// Disable sup and use the expert interface.
+			//rntm_t rntm = BLIS_RNTM_INITIALIZER;
+			rntm_t rntm = rntm_g;
+			bli_rntm_disable_l3_sup( &rntm );
+
+			bli_gemm_ex( &alpha,
+			             &a,
+			             &b,
+			             &beta,
+			             &c, NULL, &rntm );
+	#endif
+#endif
+#ifdef BLAS
+			if ( bli_is_float( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				float*    alphap = ( float* )bli_obj_buffer( &alpha );
+				float*    ap     = ( float* )bli_obj_buffer( &a );
+				float*    bp     = ( float* )bli_obj_buffer( &b );
+				float*    betap  = ( float* )bli_obj_buffer( &beta );
+				float*    cp     = ( float* )bli_obj_buffer( &c );
+
+				#ifdef XSMM
+				libxsmm_sgemm( &f77_transa,
+				#else
+				sgemm_( &f77_transa,
+				#endif
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+			else if ( bli_is_double( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				double*   alphap = ( double* )bli_obj_buffer( &alpha );
+				double*   ap     = ( double* )bli_obj_buffer( &a );
+				double*   bp     = ( double* )bli_obj_buffer( &b );
+				double*   betap  = ( double* )bli_obj_buffer( &beta );
+				double*   cp     = ( double* )bli_obj_buffer( &c );
+
+				#ifdef XSMM
+				libxsmm_dgemm( &f77_transa,
+				#else
+				dgemm_( &f77_transa,
+				#endif
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+			else if ( bli_is_scomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
+				scomplex* ap     = ( scomplex* )bli_obj_buffer( &a );
+				scomplex* bp     = ( scomplex* )bli_obj_buffer( &b );
+				scomplex* betap  = ( scomplex* )bli_obj_buffer( &beta );
+				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
+
+				#ifdef XSMM
+				libxsmm_cgemm( &f77_transa,
+				#else
+				cgemm_( &f77_transa,
+				#endif
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+			else if ( bli_is_dcomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
+				dcomplex* ap     = ( dcomplex* )bli_obj_buffer( &a );
+				dcomplex* bp     = ( dcomplex* )bli_obj_buffer( &b );
+				dcomplex* betap  = ( dcomplex* )bli_obj_buffer( &beta );
+				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
+
+				#ifdef XSMM
+				libxsmm_zgemm( &f77_transa,
+				#else
+				zgemm_( &f77_transa,
+				#endif
+				        &f77_transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+#endif
+#ifdef CBLAS
+			if ( bli_is_float( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+	#ifdef C_STOR_R
+				f77_int   lda    = bli_obj_row_stride( &a );
+				f77_int   ldb    = bli_obj_row_stride( &b );
+				f77_int   ldc    = bli_obj_row_stride( &c );
+	#else
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+	#endif
+				float*    alphap = bli_obj_buffer( &alpha );
+				float*    ap     = bli_obj_buffer( &a );
+				float*    bp     = bli_obj_buffer( &b );
+				float*    betap  = bli_obj_buffer( &beta );
+				float*    cp     = bli_obj_buffer( &c );
+
+				cblas_sgemm( cbla_storage,
+				             cbla_transa,
+				             cbla_transb,
+				             mm,
+				             nn,
+				             kk,
+				             *alphap,
+				             ap, lda,
+				             bp, ldb,
+				             *betap,
+				             cp, ldc );
+			}
+			else if ( bli_is_double( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+	#ifdef C_STOR_R
+				f77_int   lda    = bli_obj_row_stride( &a );
+				f77_int   ldb    = bli_obj_row_stride( &b );
+				f77_int   ldc    = bli_obj_row_stride( &c );
+	#else
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+	#endif
+				double*   alphap = bli_obj_buffer( &alpha );
+				double*   ap     = bli_obj_buffer( &a );
+				double*   bp     = bli_obj_buffer( &b );
+				double*   betap  = bli_obj_buffer( &beta );
+				double*   cp     = bli_obj_buffer( &c );
+
+				cblas_dgemm( cbla_storage,
+				             cbla_transa,
+				             cbla_transb,
+				             mm,
+				             nn,
+				             kk,
+				             *alphap,
+				             ap, lda,
+				             bp, ldb,
+				             *betap,
+				             cp, ldc );
+			}
+			else if ( bli_is_scomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+	#ifdef C_STOR_R
+				f77_int   lda    = bli_obj_row_stride( &a );
+				f77_int   ldb    = bli_obj_row_stride( &b );
+				f77_int   ldc    = bli_obj_row_stride( &c );
+	#else
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+	#endif
+				scomplex* alphap = bli_obj_buffer( &alpha );
+				scomplex* ap     = bli_obj_buffer( &a );
+				scomplex* bp     = bli_obj_buffer( &b );
+				scomplex* betap  = bli_obj_buffer( &beta );
+				scomplex* cp     = bli_obj_buffer( &c );
+
+				cblas_cgemm( cbla_storage,
+				             cbla_transa,
+				             cbla_transb,
+				             mm,
+				             nn,
+				             kk,
+				             alphap,
+				             ap, lda,
+				             bp, ldb,
+				             betap,
+				             cp, ldc );
+			}
+			else if ( bli_is_dcomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+	#ifdef C_STOR_R
+				f77_int   lda    = bli_obj_row_stride( &a );
+				f77_int   ldb    = bli_obj_row_stride( &b );
+				f77_int   ldc    = bli_obj_row_stride( &c );
+	#else
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+	#endif
+				dcomplex* alphap = bli_obj_buffer( &alpha );
+				dcomplex* ap     = bli_obj_buffer( &a );
+				dcomplex* bp     = bli_obj_buffer( &b );
+				dcomplex* betap  = bli_obj_buffer( &beta );
+				dcomplex* cp     = bli_obj_buffer( &c );
+
+				cblas_zgemm( cbla_storage,
+				             cbla_transa,
+				             cbla_transb,
+				             mm,
+				             nn,
+				             kk,
+				             alphap,
+				             ap, lda,
+				             bp, ldb,
+				             betap,
+				             cp, ldc );
+			}
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+		printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
+		                                  transal, transbl, STR );
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n,
+		        ( unsigned long )k, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
+