"Merge Selective Packing code from amd branch flame/blis"

Change-Id: Ifbdf49735f56a66fbbc96dab6d3ca6069302daed
2026-04-20 07:38:53 +00:00 · 2019-12-16 14:46:19 +05:30
parent 307ddc3110
commit 6b5c68b9ed
52 changed files with 4202 additions and 762 deletions
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -176,6 +176,16 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	  cntx
 	);

+#if 0
+	// Initialize the context with the sup handlers.
+	bli_cntx_set_l3_sup_handlers
+	(
+	  1,
+	  BLIS_GEMM, bli_gemmsup_ref,
+	  cntx
+	);
+#endif
+
 	// Update the context with optimized small/unpacked gemm kernels.
 	bli_cntx_set_l3_sup_kers
 	(
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -186,6 +186,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  cntx
 	);

+	// Initialize the context with the sup handlers.
+	bli_cntx_set_l3_sup_handlers
+	(
+	  1,
+	  BLIS_GEMM, bli_gemmsup_ref,
+	  cntx
+	);
+
 	// Update the context with optimized small/unpacked gemm kernels.
 	bli_cntx_set_l3_sup_kers
 	(
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -73,7 +73,11 @@

 // Prototype reference implementation of small/unpacked matrix handler.
 #include "bli_l3_sup_ref.h"
+#include "bli_l3_sup_int.h"
 #include "bli_l3_sup_vars.h"
+#include "bli_l3_sup_packm_a.h"
+#include "bli_l3_sup_packm_b.h"
+#include "bli_l3_sup_packm_var.h"

 // Prototype microkernel wrapper APIs.
 #include "bli_l3_ukr_oapi.h"
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -104,14 +104,6 @@ err_t bli_gemmsup
 	// that function assumes the context pointer is valid.
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

-#if 0
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_l;
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; }
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; }
-#endif
-
 	// Return early if a microkernel preference-induced transposition would
 	// have been performed and shifted the dimensions outside of the space
 	// of sup-handled problems.
@@ -138,6 +130,12 @@ err_t bli_gemmsup
 		}
 	}

+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
 #if 0
 const num_t dt = bli_obj_dt( c );
 const dim_t m  = bli_obj_length( c );
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -0,0 +1,173 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// bli_gemmsup_int() chooses which small/unpacked (sup) gemm variant to run
+// for the given operands and then runs it. Nothing is computed by this
+// function itself; all of the work happens inside the chosen variant.
+//
+// Parameters:
+//   alpha, a, b, beta, c - operand objects, forwarded unchanged to the chosen
+//                          variant. The strides of c, a, and b determine the
+//                          storage combination (stor_id); c also supplies the
+//                          datatype and the dimensions used for dispatch.
+//   cntx                 - context that is queried for the sup kernel's
+//                          row/column preference and for the default MR and
+//                          NR blocksizes. No NULL check is performed here.
+//   rntm, cntl, thread   - forwarded unchanged to the chosen variant.
+//
+// Dispatch:
+//   - A storage combination is "primary" when it belongs to the group
+//     (rrr rrc rcr crr, or rcc crc ccr ccc) that matches the kernel's
+//     preference. Primary cases run with BLIS_NO_TRANSPOSE; all others run
+//     with BLIS_TRANSPOSE, and the length and width of c swap roles when
+//     the problem is sized.
+//   - mu = m / MR and nu = n / NR are then compared, where m and n are the
+//     (possibly swapped) dimensions of c: var2m is used when mu >= nu, and
+//     var1n otherwise. Ties, including mu == nu == 0, go to var2m.
+//
+// Return value:
+//   BLIS_FAILURE if the storage combination is BLIS_XXX (general stride), in
+//   which case nothing is computed; BLIS_SUCCESS otherwise. The variants are
+//   invoked as plain statements, so no status is propagated from them.
+//
+err_t bli_gemmsup_int
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+	// Don't use the small/unpacked implementation if one of the matrices
+	// uses general stride.
+	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
+
+	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
+	                                     stor_id == BLIS_RRC ||
+	                                     stor_id == BLIS_RCR ||
+	                                     stor_id == BLIS_CRR );
+	const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
+
+	const num_t   dt       = bli_obj_dt( c );
+	const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+
+	const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
+	                                      : is_rcc_crc_ccr_ccc );
+
+	if ( is_primary )
+	{
+		// This branch handles:
+		//  - rrr rrc rcr crr for row-preferential kernels
+		//  - rcc crc ccr ccc for column-preferential kernels
+
+		const dim_t m  = bli_obj_length( c );
+		const dim_t n  = bli_obj_width( c );
+		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+		const dim_t mu = m / MR;
+		const dim_t nu = n / NR;
+
+		// Choose the variant by comparing the two quotients; ties go to var2m.
+		if ( mu >= nu )
+		{
+			#ifdef TRACEVAR
+			printf( "bli_l3_sup_int(): var2m primary\n" );
+			#endif
+			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
+			bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
+			                       alpha, a, b, beta, c,
+			                       stor_id, cntx, rntm, cntl, thread );
+		}
+		else // if ( mu < nu )
+		{
+			#ifdef TRACEVAR
+			printf( "bli_l3_sup_int(): var1n primary\n" );
+			#endif
+			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
+			bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
+			                       alpha, a, b, beta, c,
+			                       stor_id, cntx, rntm, cntl, thread );
+		}
+	}
+	else
+	{
+		// This branch handles:
+		//  - rrr rrc rcr crr for column-preferential kernels
+		//  - rcc crc ccr ccc for row-preferential kernels
+
+		const dim_t mt = bli_obj_width( c );
+		const dim_t nt = bli_obj_length( c );
+		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+		const dim_t mu = mt / MR;
+		const dim_t nu = nt / NR;
+
+		// Same comparison as above, on the swapped dimensions; ties go to var2m.
+		if ( mu >= nu )
+		{
+			#ifdef TRACEVAR
+			printf( "bli_l3_sup_int(): var2m non-primary\n" );
+			#endif
+			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
+			bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
+			                       alpha, a, b, beta, c,
+			                       stor_id, cntx, rntm, cntl, thread );
+		}
+		else // if ( mu < nu )
+		{
+			#ifdef TRACEVAR
+			printf( "bli_l3_sup_int(): var1n non-primary\n" );
+			#endif
+			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
+			bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
+			                       alpha, a, b, beta, c,
+			                       stor_id, cntx, rntm, cntl, thread );
+		}
+		// *requires nudging of mc,nc up to be a multiple of nr,mr.
+	}
+
+	// Return success so that the caller knows that we computed the solution.
+	return BLIS_SUCCESS;
+}
+
--- a/frame/3/bli_l3_sup_int.h
+++ b/frame/3/bli_l3_sup_int.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+err_t bli_gemmsup_int // defined in bli_l3_sup_int.c; returns BLIS_FAILURE for general-stride operands, else BLIS_SUCCESS
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     );
--- a/frame/3/bli_l3_sup_packm_a.h
+++ b/frame/3/bli_l3_sup_packm_a.h
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_init_mem_a() for each type instantiated below; presumably prepares mem as the pack buffer for A when will_pack is set -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       packbuf_t        pack_buf_type, \
+       stor3_t          stor_id, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_finalize_mem_a() for each type instantiated below; presumably gives back the buffer tracked by mem when did_pack is set -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           did_pack, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_init_a() for each type instantiated below; the pointer arguments (schema, m_max, k_max, p, rs_p, cs_p, pd_p, ps_p) look like outputs describing the packed A -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       stor3_t          stor_id, \
+       pack_t* restrict schema, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       dim_t*  restrict m_max, \
+       dim_t*  restrict k_max, \
+       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_init_a )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_a() for each type instantiated below; presumably packs a (scaled by kappa) into *p when will_pack is set and reports the resulting strides -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       stor3_t          stor_id, \
+       trans_t          transc, \
+       dim_t            m, \
+       dim_t            k, \
+       dim_t            mr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict a, inc_t           rs_a, inc_t           cs_a, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_a )
+
--- a/frame/3/bli_l3_sup_packm_b.h
+++ b/frame/3/bli_l3_sup_packm_b.h
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_init_mem_b() for each type instantiated below; presumably prepares mem as the pack buffer for B when will_pack is set -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       packbuf_t        pack_buf_type, \
+       stor3_t          stor_id, \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_finalize_mem_b() for each type instantiated below; presumably gives back the buffer tracked by mem when did_pack is set -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           did_pack, \
+       rntm_t* restrict rntm, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_init_b() for each type instantiated below; the pointer arguments (schema, k_max, n_max, p, rs_p, cs_p, pd_p, ps_p) look like outputs describing the packed B -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       stor3_t          stor_id, \
+       pack_t* restrict schema, \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       dim_t*  restrict k_max, \
+       dim_t*  restrict n_max, \
+       ctype*           x, inc_t           rs_x, inc_t           cs_x, \
+       ctype**          p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                           dim_t* restrict pd_p, inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_init_b )
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares <ch>packm_sup_b() for each type instantiated below; presumably packs x (scaled by kappa) into *p when will_pack is set and reports the resulting strides -- confirm against the definition. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       bool_t           will_pack, \
+       stor3_t          stor_id, \
+       trans_t          transc, \
+       dim_t            k, \
+       dim_t            n, \
+       dim_t            nr, \
+       ctype*  restrict kappa, \
+       ctype*  restrict x, inc_t           rs_x, inc_t           cs_x, \
+       ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
+                                                 inc_t* restrict ps_p, \
+       cntx_t* restrict cntx, \
+       mem_t*  restrict mem, \
+       thrinfo_t* restrict thread  \
+     ); \
+
+INSERT_GENTPROT_BASIC0( packm_sup_b )
+
--- a/frame/3/bli_l3_sup_packm_var.c
+++ b/frame/3/bli_l3_sup_packm_var.c
@@ -0,0 +1,329 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define BLAS-like interfaces to the variants.
+//
+
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+/* Packs the m x n matrix c into micropanels stored in p, issuing one packm_cxk call per micropanel that bli_packm_my_iter() assigns to the calling thread. */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            m_max, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+                           dim_t pd_p, inc_t ps_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+\
+	dim_t           iter_dim; \
+	dim_t           n_iter; \
+	dim_t           it, ic; \
+	dim_t           ic0; \
+	doff_t          ic_inc; \
+	dim_t           panel_len_full; \
+	dim_t           panel_len_i; \
+	dim_t           panel_len_max; \
+	dim_t           panel_len_max_i; \
+	dim_t           panel_dim_i; \
+	dim_t           panel_dim_max; \
+	inc_t           vs_c; \
+	inc_t           ldc; \
+	inc_t           ldp, p_inc; \
+	conj_t          conjc; \
+/* transc supplies the conjugation and transposition applied to c; schema is handed to packm_cxk and also selects the panel orientation below. */ \
+/* pd_p is used as the micropanel dimension (panel_dim_max); ps_p is the distance, in ctype elements, between consecutive micropanels in p. */ \
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* If c needs a transposition, induce it so that we can more simply
+	   express the remaining parameters and code. */ \
+	if ( bli_does_trans( transc ) ) \
+	{ \
+		bli_swap_incs( &rs_c, &cs_c ); \
+		bli_toggle_trans( &transc ); \
+	} \
+\
+	/* Create a flag to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	bool_t row_stored = bli_is_col_packed( schema ); \
+	/*bool_t col_stored = bli_is_row_packed( schema );*/ \
+\
+	/* If the row storage flag indicates row storage, then we are packing
+	   to column panels; otherwise (column storage) we are packing to row
+	   panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len_full = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		vs_c           = cs_c; \
+		ldc            = rs_c; \
+		ldp            = rs_p; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len_full = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		vs_c           = rs_c; \
+		ldc            = cs_c; \
+		ldp            = cs_p; \
+	} \
+\
+	/* Compute the total number of iterations we'll need (iter_dim / panel_dim_max, rounded up). */ \
+	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	/* Set the initial value and increment of the index into C. This
+	   variant always iterates forward; there is no reverse option. */ \
+	{ \
+		ic0    = 0; \
+		ic_inc = panel_dim_max; \
+	} \
+\
+	ctype* restrict p_begin = p_cast; \
+\
+	/* Query the number of threads and thread ids from the current thread's
+	   packm thrinfo_t node. */ \
+	const dim_t nt  = bli_thread_n_way( thread ); \
+	const dim_t tid = bli_thread_work_id( thread ); \
+\
+	/* Suppress warnings in case nt and tid aren't used (ie: as in slab partitioning). */ \
+	( void )nt; \
+	( void )tid; \
+\
+	dim_t it_start, it_end, it_inc; \
+\
+	/* Determine the thread range and increment using the current thread's
+	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
+	   will depend on whether slab or round-robin partitioning was requested
+	   at configure-time. */ \
+	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
+\
+	/* Iterate over every logical micropanel in the source matrix. */ \
+	for ( ic  = ic0,    it  = 0; it < n_iter; \
+	      ic += ic_inc, it += 1 ) \
+	{ \
+		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		ctype* restrict c_begin = c_cast   + (ic  )*vs_c; \
+\
+		ctype* restrict c_use = c_begin; \
+		ctype* restrict p_use = p_begin; \
+\
+		{ \
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			/* The definition of bli_packm_my_iter() will depend on whether slab
+			   or round-robin partitioning was requested at configure-time. */ \
+			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
+			{ \
+				PASTEMAC(ch,packm_cxk) \
+				( \
+				  conjc, \
+				  schema, \
+				  panel_dim_i, \
+				  panel_dim_max, \
+				  panel_len_i, \
+				  panel_len_max_i, \
+				  kappa_cast, \
+				  c_use, vs_c, ldc, \
+				  p_use,       ldp, \
+				  cntx  \
+				); \
+			} \
+\
+			/* Each micropanel advances p by ps_p elements. */ \
+			p_inc = ps_p; \
+		} \
+/* Advance p on every iteration, whether or not this thread packed the panel, so that p_use depends only on the iteration index. */ \
+		p_begin += p_inc; \
+\
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
+                      p_use,         rs_p, cs_p, "%5.2f", "" ); \
+if ( !row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
+                      p_use,         rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+	} \
+\
+}
+
+INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )
+
+
+
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+if ( col_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
+                      c_cast,        rs_c, cs_c, "%4.1f", "" ); \
+*/
+/*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+else \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+\
+/*
+if ( col_stored ) { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+else { \
+	if ( bli_thread_work_id( thread ) == 0 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+	if ( bli_thread_work_id( thread ) == 1 ) \
+	{ \
+	printf( "packm_blk_var1: thread %lu  (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
+	fflush( stdout ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
+	                      ( ctype* )c_use,         rs_c, cs_c, "%4.1f", "" ); \
+	PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
+	                      ( ctype* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+	fflush( stdout ); \
+	} \
+bli_thread_obarrier( thread ); \
+} \
+*/
+/*
+		if ( bli_is_4mi_packed( schema ) ) { \
+		printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
+		if ( col_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		if ( row_stored ) { \
+		if ( 0 ) \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
+		                       ( ctype_r* )c_use,         2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,            rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+		} \
+*/
+/*
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+*/
+/*
+		if ( row_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		inc_t is_b = rs_p * *m_panel_max; \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
+/*
+		if ( col_stored ) { \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )c_use,        2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
+		                       (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use,         rs_p, cs_p, "%4.1f", "" ); \
+		PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
+		                       ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
+		} \
+*/
--- a/frame/3/bli_l3_sup_packm_var.h
+++ b/frame/3/bli_l3_sup_packm_var.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//
+// Prototype BLAS-like interfaces to the variants.
+//
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       trans_t          transc, \
+       pack_t           schema, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            m_max, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype*  restrict p, inc_t rs_p, inc_t cs_p, \
+                           dim_t pd_p, inc_t ps_p, \
+       cntx_t* restrict cntx, \
+       thrinfo_t* restrict thread  \
+     );
+
+INSERT_GENTPROT_BASIC0( packm_sup_var1 )
+
--- a/frame/3/bli_l3_sup_ref.c
+++ b/frame/3/bli_l3_sup_ref.c
@@ -45,6 +45,11 @@ err_t bli_gemmsup_ref
       rntm_t* rntm
     )
 {
+	// This function implements the default gemmsup handler. If you are a
+	// BLIS developer and wish to use a different gemmsup handler, please
+	// register a different function pointer in the context in your
+	// sub-configuration's bli_cntx_init_*() function.
+
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -85,6 +90,14 @@ err_t bli_gemmsup_ref
 	//bli_rntm_set_pack_a( 0, rntm );
 	//bli_rntm_set_pack_b( 0, rntm );
 #endif
+	//bli_rntm_set_pack_a( 0, rntm );
+	//bli_rntm_set_pack_b( 0, rntm );
+
+	// May not need these here since packm_sup infers the schemas based
+	// on the stor3_t id. (This would also mean that they don't need to
+	// be passed into the thread decorator below.)
+	//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
+	//pack_t schema_b = BLIS_PACKED_COL_PANELS;

 	return
 	bli_l3_sup_thread_decorator
--- a/frame/3/bli_l3_sup_var1n2m.c
+++ b/frame/3/bli_l3_sup_var1n2m.c
@@ -119,6 +119,9 @@ void bli_gemmsup_ref_var1n
 	const bool     packa     = bli_rntm_pack_a( rntm );
 	const bool     packb     = bli_rntm_pack_b( rntm );

+	const bool_t   packa     = bli_rntm_pack_a( rntm );
+	const bool_t   packb     = bli_rntm_pack_b( rntm );
+
 	const conj_t   conja     = bli_obj_conj_status( a );
 	const conj_t   conjb     = bli_obj_conj_status( b );

@@ -186,6 +189,8 @@ void bli_gemmsup_ref_var1n
 		// Invoke the function.
 		f
 		(
+		  packa,
+		  packb,
 		  conja,
 		  conjb,
 		  m,
@@ -207,6 +212,8 @@ void bli_gemmsup_ref_var1n
 		// Invoke the function (transposing the operation).
 		f
 		(
+		  packb,
+		  packa,
 		  conjb,             // swap the conj values.
 		  conja,
 		  n,                 // swap the m and n dimensions.
@@ -249,6 +256,8 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* restrict thread  \
     ) \
 { \
+	const num_t dt = PASTEMAC(ch,type); \
+\
 	/* If m or n is zero, return immediately. */ \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
@@ -270,16 +279,16 @@ void PASTEMAC(ch,varname) \
 		} \
 		return; \
 	} \
-\
-	const num_t dt  = PASTEMAC(ch,type); \
 \
 	/* This transposition of the stor3_t id value is inherent to variant 1.
 	   The reason: we assume that variant 2 is the "main" variant. The
 	   consequence of this is that we assume that the millikernels that
-	   iterate over m are registered to the kernel group associated with
-	   the kernel preference. So, regardless of whether the mkernels are
-	   row- or column-preferential, millikernels that iterate over n are
-	   always placed in the slots for the opposite kernel group. */ \
+	   iterate over m are registered to the "primary" kernel group associated
+	   with the kernel IO preference; similarly, mkernels that iterate over
+	   n are assumed to be registered to the "non-primary" group associated
+	   with the ("non-primary") anti-preference. Note that this pattern holds
+	   regardless of whether the mkernel set has a row or column preference.
+	   See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
 	stor_id = bli_stor3_trans( stor_id ); \
 \
 	/* Query the context for various blocksizes. */ \
@@ -326,7 +335,9 @@ void PASTEMAC(ch,varname) \
 		else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
 	} \
 \
-	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
+	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
+	   NOTE: This is unique to variant 1 (ie: not performed in variant 2)
+	   because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
 	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
 	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
 \
@@ -346,7 +357,11 @@ void PASTEMAC(ch,varname) \
 	const inc_t icstep_b = cs_b; \
 \
 	const inc_t jrstep_c = rs_c * MR; \
+\
+	/*
 	const inc_t jrstep_a = rs_a * MR; \
+	( void )jrstep_a; \
+	*/ \
 \
 	const inc_t irstep_c = cs_c * NR; \
 	const inc_t irstep_b = cs_b * NR; \
@@ -435,6 +450,45 @@ void PASTEMAC(ch,varname) \
 	/* Compute number of primary and leftover components of the JC loop. */ \
 	/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
 	const dim_t jc_left =   m_local % NC; \
+\
+	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+	   needed for the matrix we will be packing (if any), but we do it
+	   unconditionally to be safe. An alternative way of initializing the
+	   mem_t entries is:
+
+	     bli_mem_clear( &mem_a ); \
+	     bli_mem_clear( &mem_b ); \
+	*/ \
+	mem_t mem_a = BLIS_MEM_INITIALIZER; \
+	mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+	/* Prepare the packing destination buffer. If packing is not requested for
+	   matrix A, this function will reduce to a no-op. */ \
+	PASTEMAC(ch,packm_sup_init_mem_a) \
+	( \
+	  packa, \
+	  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
+	  stor_id, \
+	  NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
+	  cntx, \
+	  rntm, \
+	  &mem_a, \
+	  thread  \
+	); \
+\
+	/* Prepare the packing destination buffer. If packing is not requested for
+	   matrix B, this function will reduce to a no-op. */ \
+	PASTEMAC(ch,packm_sup_init_mem_b) \
+	( \
+	  packb, \
+	  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
+	  stor_id, \
+	  KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
+	  cntx, \
+	  rntm, \
+	  &mem_b, \
+	  thread  \
+	); \
 \
 	/* Loop over the m dimension (NC rows/columns at a time). */ \
 	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -537,6 +591,39 @@ void PASTEMAC(ch,varname) \
 			/* Compute number of primary and leftover components of the IC loop. */ \
 			/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
 			const dim_t ic_left =   n_local % MC; \
+\
+			ctype* a_use; \
+			inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+			/* Determine the packing buffer and related parameters for matrix
+			   A. (If A will not be packed, then a_use will be set to point to
+			   a and the _a_use strides will be set accordingly.) Then call
+			   the packm sup variant chooser, which will call the appropriate
+			   implementation based on the schema deduced from the stor_id. */ \
+			PASTEMAC(ch,packm_sup_a) \
+			( \
+			  packa, \
+			  stor_id, \
+			  BLIS_NO_TRANSPOSE, \
+			  nc_cur, kc_cur, MR, \
+			  one, \
+			  a_pc,   rs_a,      cs_a, \
+			  &a_use, &rs_a_use, &cs_a_use, \
+			                     &ps_a_use, \
+			  cntx, \
+			  &mem_a, \
+			  thread  \
+			); \
+\
+			/* Alias a_use so that it's clear this is our current block of
+			   matrix A. */ \
+			ctype* restrict a_pc_use = a_use; \
+\
+			/* We don't need to embed the panel stride of A within the auxinfo_t
+			   object because this variant iterates through A in the jr loop,
+			   which occurs here, within the macrokernel, not within the
+			   millikernel. */ \
+			/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
 \
 			/* Loop over the n dimension (MC rows at a time). */ \
 			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -622,6 +709,41 @@ void PASTEMAC(ch,varname) \
 				/* Compute the JR loop thread range for the current thread. */ \
 				dim_t jr_start, jr_end; \
 				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				ctype* b_use; \
+				inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+				/* Determine the packing buffer and related parameters for matrix
+				   B. (If B will not be packed, then b_use will be set to point to
+				   b and the _b_use strides will be set accordingly.) Then call
+				   the packm sup variant chooser, which will call the appropriate
+				   implementation based on the schema deduced from the stor_id.
+				   NOTE: packing matrix B in this panel-block algorithm corresponds
+				   to packing matrix A in the block-panel algorithm. */ \
+				PASTEMAC(ch,packm_sup_b) \
+				( \
+				  packb, \
+				  stor_id, \
+				  BLIS_NO_TRANSPOSE, \
+				  kc_cur, mc_cur, NR, \
+				  one, \
+				  b_ic,   rs_b,      cs_b, \
+				  &b_use, &rs_b_use, &cs_b_use, \
+				                     &ps_b_use, \
+				  cntx, \
+				  &mem_b, \
+				  thread  \
+				); \
+\
+				/* Alias b_use so that it's clear this is our current block of
+				   matrix B. */ \
+				ctype* restrict b_ic_use = b_use; \
+\
+				/* Embed the panel stride of B within the auxinfo_t object. The
+				   millikernel will query and use this to iterate through
+				   micropanels of B. */ \
+				bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
+\
 \
 				/* Loop over the m dimension (NR columns at a time). */ \
 				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -651,10 +773,10 @@ void PASTEMAC(ch,varname) \
 						  mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
 						  kc_cur, \
 						  alpha_cast, \
-						  a_jr, rs_a, cs_a, \
-						  b_ic, rs_b, cs_b, \
+						  a_jr,     rs_a_use, cs_a_use, \
+						  b_ic_use, rs_b_use, cs_b_use, \
 						  beta_use, \
-						  c_jr, rs_c, cs_c, \
+						  c_jr,     rs_c,     cs_c, \
 						  &aux, \
 						  cntx  \
 						); \
@@ -757,6 +879,9 @@ void bli_gemmsup_ref_var2m
 	const bool     packa     = bli_rntm_pack_a( rntm );
 	const bool     packb     = bli_rntm_pack_b( rntm );

+	const bool_t   packa     = bli_rntm_pack_a( rntm );
+	const bool_t   packb     = bli_rntm_pack_b( rntm );
+
 	const conj_t   conja     = bli_obj_conj_status( a );
 	const conj_t   conjb     = bli_obj_conj_status( b );

@@ -824,6 +949,8 @@ void bli_gemmsup_ref_var2m
 		// Invoke the function.
 		f
 		(
+		  packa,
+		  packb,
 		  conja,
 		  conjb,
 		  m,
@@ -845,6 +972,8 @@ void bli_gemmsup_ref_var2m
 		// Invoke the function (transposing the operation).
 		f
 		(
+		  packb,             // swap the pack values.
+		  packa,
 		  conjb,             // swap the conj values.
 		  conja,
 		  n,                 // swap the m and n dimensions.
@@ -887,6 +1016,8 @@ void PASTEMAC(ch,varname) \
       thrinfo_t* restrict thread  \
     ) \
 { \
+	const num_t dt = PASTEMAC(ch,type); \
+\
 	/* If m or n is zero, return immediately. */ \
 	if ( bli_zero_dim2( m, n ) ) return; \
 \
@@ -908,8 +1039,6 @@ void PASTEMAC(ch,varname) \
 		} \
 		return; \
 	} \
-\
-	const num_t dt  = PASTEMAC(ch,type); \
 \
 	/* Query the context for various blocksizes. */ \
 	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -972,6 +1101,8 @@ void PASTEMAC(ch,varname) \
 	const inc_t icstep_a = rs_a; \
 \
 	const inc_t jrstep_c = cs_c * NR; \
+\
+	/*
 	const inc_t jrstep_b = cs_b * NR; \
 	( void )jrstep_b; \
 \
@@ -1051,6 +1182,45 @@ void PASTEMAC(ch,varname) \
 	/* Compute number of primary and leftover components of the JC loop. */ \
 	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
 	const dim_t jc_left =   n_local % NC; \
+\
+	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+	   needed for the matrix we will be packing (if any), but we do it
+	   unconditionally to be safe. An alternative way of initializing the
+	   mem_t entries is:
+
+	     bli_mem_clear( &mem_a ); \
+	     bli_mem_clear( &mem_b ); \
+	*/ \
+	mem_t mem_a = BLIS_MEM_INITIALIZER; \
+	mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+	/* Prepare the packing destination buffer. If packing is not requested for
+	   matrix A, this function will reduce to a no-op. */ \
+	PASTEMAC(ch,packm_sup_init_mem_a) \
+	( \
+	  packa, \
+	  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
+	  stor_id, \
+	  MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
+	  cntx, \
+	  rntm, \
+	  &mem_a, \
+	  thread  \
+	); \
+\
+	/* Prepare the packing destination buffer. If packing is not requested for
+	   matrix B, this function will reduce to a no-op. */ \
+	PASTEMAC(ch,packm_sup_init_mem_b) \
+	( \
+	  packb, \
+	  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
+	  stor_id, \
+	  KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
+	  cntx, \
+	  rntm, \
+	  &mem_b, \
+	  thread  \
+	); \
 \
 	/* Loop over the n dimension (NC rows/columns at a time). */ \
 	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -1151,6 +1321,39 @@ void PASTEMAC(ch,varname) \
 			/* Compute number of primary and leftover components of the IC loop. */ \
 			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
 			const dim_t ic_left =   m_local % MC; \
+\
+			ctype* b_use; \
+			inc_t  rs_b_use, cs_b_use, ps_b_use; \
+\
+			/* Determine the packing buffer and related parameters for matrix
+			   B. (If B will not be packed, then b_use will be set to point to
+			   b and the _b_use strides will be set accordingly.) Then call
+			   the packm sup variant chooser, which will call the appropriate
+			   implementation based on the schema deduced from the stor_id. */ \
+			PASTEMAC(ch,packm_sup_b) \
+			( \
+			  packb, \
+			  stor_id, \
+			  BLIS_NO_TRANSPOSE, \
+			  kc_cur, nc_cur, NR, \
+			  one, \
+			  b_pc,   rs_b,      cs_b, \
+			  &b_use, &rs_b_use, &cs_b_use, \
+			                     &ps_b_use, \
+			  cntx, \
+			  &mem_b, \
+			  thread  \
+			); \
+\
+			/* Alias b_use so that it's clear this is our current block of
+			   matrix B. */ \
+			ctype* restrict b_pc_use = b_use; \
+\
+			/* We don't need to embed the panel stride of B within the auxinfo_t
+			   object because this variant iterates through B in the jr loop,
+			   which occurs here, within the macrokernel, not within the
+			   millikernel. */ \
+			/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
 \
 			/* Loop over the m dimension (MC rows at a time). */ \
 			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -1234,6 +1437,38 @@ void PASTEMAC(ch,varname) \
 				/* Compute the JR loop thread range for the current thread. */ \
 				dim_t jr_start, jr_end; \
 				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+				ctype* a_use; \
+				inc_t  rs_a_use, cs_a_use, ps_a_use; \
+\
+				/* Determine the packing buffer and related parameters for matrix
+				   A. (If A will not be packed, then a_use will be set to point to
+				   a and the _a_use strides will be set accordingly.) Then call
+				   the packm sup variant chooser, which will call the appropriate
+				   implementation based on the schema deduced from the stor_id. */ \
+				PASTEMAC(ch,packm_sup_a) \
+				( \
+				  packa, \
+				  stor_id, \
+				  BLIS_NO_TRANSPOSE, \
+				  mc_cur, kc_cur, MR, \
+				  one, \
+				  a_ic,   rs_a,      cs_a, \
+				  &a_use, &rs_a_use, &cs_a_use, \
+				                     &ps_a_use, \
+				  cntx, \
+				  &mem_a, \
+				  thread  \
+				); \
+\
+				/* Alias a_use so that it's clear this is our current block of
+				   matrix A. */ \
+				ctype* restrict a_ic_use = a_use; \
+\
+				/* Embed the panel stride of A within the auxinfo_t object. The
+				   millikernel will query and use this to iterate through
+				   micropanels of A (if needed). */ \
+				bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
 \
 				/* Loop over the n dimension (NR columns at a time). */ \
 				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -1263,10 +1498,10 @@ void PASTEMAC(ch,varname) \
 						  nr_cur, \
 						  kc_cur, \
 						  alpha_cast, \
-						  a_ic, rs_a, cs_a, \
-						  b_jr, rs_b, cs_b, \
+						  a_ic_use, rs_a_use, cs_a_use, \
+						  b_jr,     rs_b_use, cs_b_use, \
 						  beta_use, \
-						  c_jr, rs_c, cs_c, \
+						  c_jr,     rs_c,     cs_c, \
 						  &aux, \
 						  cntx  \
 						); \
--- a/frame/3/old/bli_l3_sup_var1n2m.c
+++ b/frame/3/old/bli_l3_sup_var1n2m.c
@@ -0,0 +1,821 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmsup_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       conj_t           conja,
+       conj_t           conjb,
+       dim_t            m,
+       dim_t            n,
+       dim_t            k,
+       void*   restrict alpha,
+       void*   restrict a, inc_t rs_a, inc_t cs_a,
+       void*   restrict b, inc_t rs_b, inc_t cs_b,
+       void*   restrict beta,
+       void*   restrict c, inc_t rs_c, inc_t cs_c,
+       stor3_t          eff_id,
+       cntx_t* restrict cntx,
+       rntm_t* restrict rntm,
+       cntl_t* restrict cntl,
+       thrinfo_t* restrict thread
+     );
+
+//
+// -- var1n --------------------------------------------------------------------
+//
+
+static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);
+
+void bli_gemmsup_ref_var1n
+     (
+       trans_t trans,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       stor3_t eff_id,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+#if 0
+	obj_t at, bt;
+
+	bli_obj_alias_to( a, &at );
+	bli_obj_alias_to( b, &bt );
+
+	// Induce transpositions on A and/or B if either object is marked for
+	// transposition. We can induce "fast" transpositions since the objects
+	// are guaranteed to not have structure or be packed.
+	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
+	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
+
+	const num_t    dt_exec   = bli_obj_dt( c );
+
+	const conj_t   conja     = bli_obj_conj_status( a );
+	const conj_t   conjb     = bli_obj_conj_status( b );
+
+	const dim_t    m         = bli_obj_length( c );
+	const dim_t    n         = bli_obj_width( c );
+
+	const dim_t    k         = bli_obj_width( &at );
+
+	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	const inc_t    rs_a      = bli_obj_row_stride( &at );
+	const inc_t    cs_a      = bli_obj_col_stride( &at );
+
+	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	const inc_t    rs_b      = bli_obj_row_stride( &bt );
+	const inc_t    cs_b      = bli_obj_col_stride( &bt );
+
+	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t    rs_c      = bli_obj_row_stride( c );
+	const inc_t    cs_c      = bli_obj_col_stride( c );
+
+	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+
+#else
+
+	const num_t    dt_exec   = bli_obj_dt( c );
+
+	const conj_t   conja     = bli_obj_conj_status( a );
+	const conj_t   conjb     = bli_obj_conj_status( b );
+
+	const dim_t    m         = bli_obj_length( c );
+	const dim_t    n         = bli_obj_width( c );
+	      dim_t    k;
+
+	void* restrict buf_a = bli_obj_buffer_at_off( a );
+	      inc_t    rs_a;
+	      inc_t    cs_a;
+
+	void* restrict buf_b = bli_obj_buffer_at_off( b );
+	      inc_t    rs_b;
+	      inc_t    cs_b;
+
+	if ( bli_obj_has_notrans( a ) )
+	{
+		k     = bli_obj_width( a );
+
+		rs_a  = bli_obj_row_stride( a );
+		cs_a  = bli_obj_col_stride( a );
+	}
+	else // if ( bli_obj_has_trans( a ) )
+	{
+		// Assign the variables with an implicit transposition.
+		k     = bli_obj_length( a );
+
+		rs_a  = bli_obj_col_stride( a );
+		cs_a  = bli_obj_row_stride( a );
+	}
+
+	if ( bli_obj_has_notrans( b ) )
+	{
+		rs_b  = bli_obj_row_stride( b );
+		cs_b  = bli_obj_col_stride( b );
+	}
+	else // if ( bli_obj_has_trans( b ) )
+	{
+		// Assign the variables with an implicit transposition.
+		rs_b  = bli_obj_col_stride( b );
+		cs_b  = bli_obj_row_stride( b );
+	}
+
+	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t    rs_c      = bli_obj_row_stride( c );
+	const inc_t    cs_c      = bli_obj_col_stride( c );
+
+	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+
+#endif
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	FUNCPTR_T f = ftypes_var1n[dt_exec];
+
+	if ( bli_is_notrans( trans ) )
+	{
+		// Invoke the function.
+		f
+		(
+		  conja,
+		  conjb,
+		  m,
+		  n,
+		  k,
+		  buf_alpha,
+		  buf_a, rs_a, cs_a,
+		  buf_b, rs_b, cs_b,
+		  buf_beta,
+		  buf_c, rs_c, cs_c,
+		  eff_id,
+		  cntx,
+		  rntm,
+		  cntl,
+		  thread
+		);
+	}
+	else
+	{
+		// Invoke the function (transposing the operation).
+		f
+		(
+		  conjb,             // swap the conj values.
+		  conja,
+		  n,                 // swap the m and n dimensions.
+		  m,
+		  k,
+		  buf_alpha,
+		  buf_b, cs_b, rs_b, // swap the positions of A and B.
+		  buf_a, cs_a, rs_a, // swap the strides of A and B.
+		  buf_beta,
+		  buf_c, cs_c, rs_c, // swap the strides of C.
+		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
+		  cntx,
+		  rntm,
+		  cntl,
+		  thread
+		);
+	}
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       stor3_t          stor_id, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       cntl_t* restrict cntl, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	/* If m or n is zero, return immediately. */ \
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	/* If k < 1 or alpha is zero, scale by beta and return. */ \
+	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
+	{ \
+		PASTEMAC(ch,scalm) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  0, \
+		  BLIS_NONUNIT_DIAG, \
+		  BLIS_DENSE, \
+		  m, n, \
+		  beta, \
+		  c, rs_c, cs_c \
+		); \
+		return; \
+	} \
+\
+	const num_t dt  = PASTEMAC(ch,type); \
+\
+	/* This transposition of the stor3_t id value is inherent to variant 1.
+	   The reason: we assume that variant 2 is the "main" variant. The
+	   consequence of this is that we assume that the millikernels that
+	   iterate over m are registered to the kernel group associated with
+	   the kernel preference. So, regardless of whether the mkernels are
+	   row- or column-preferential, millikernels that iterate over n are
+	   always placed in the slots for the opposite kernel group. */ \
+	stor_id = bli_stor3_trans( stor_id ); \
+\
+	/* Query the context for various blocksizes. */ \
+	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
+	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
+	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+	dim_t KC; \
+	if      ( FALSE                  ) KC = KC0; \
+	else if ( stor_id == BLIS_RRC || \
+	          stor_id == BLIS_CRC    ) KC = KC0; \
+	else if ( m <=   MR && n <=   NR ) KC = KC0; \
+	else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
+	else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
+	else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
+	else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
+\
+	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
+	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
+	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
+\
+	/* Query the maximum blocksize for MR, which implies a maximum blocksize
+	   extension for the final iteration. */ \
+	const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
+	const dim_t MRE = MRM - MR; \
+\
+	/* Compute partitioning step values for each matrix of each loop. */ \
+	const inc_t jcstep_c = rs_c * NC; \
+	const inc_t jcstep_a = rs_a * NC; \
+\
+	const inc_t pcstep_a = cs_a * KC; \
+	const inc_t pcstep_b = rs_b * KC; \
+\
+	const inc_t icstep_c = cs_c * MC; \
+	const inc_t icstep_b = cs_b * MC; \
+\
+	const inc_t jrstep_c = rs_c * MR; \
+	const inc_t jrstep_a = rs_a * MR; \
+\
+	/*
+	const inc_t irstep_c = cs_c * NR; \
+	const inc_t irstep_b = cs_b * NR; \
+	*/ \
+\
+	/* Query the context for the sup microkernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemmsup_ker_ft) \
+               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
+\
+	ctype* restrict a_00       = a; \
+	ctype* restrict b_00       = b; \
+	ctype* restrict c_00       = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+\
+	auxinfo_t       aux; \
+\
+	/* Compute number of primary and leftover components of the outer
+	   dimensions.
+	   NOTE: Functionally speaking, we compute jc_iter as:
+	     jc_iter = m / NC; if ( jc_left ) ++jc_iter;
+	   However, this is implemented as:
+	     jc_iter = ( m + NC - 1 ) / NC;
+	   This avoids a branch at the cost of two additional integer instructions.
+	   The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
+	   similar manner. */ \
+	const dim_t jc_iter = ( m + NC - 1 ) / NC; \
+	const dim_t jc_left =   m % NC; \
+\
+	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
+	const dim_t pc_left =   k % KC; \
+\
+	const dim_t ic_iter = ( n + MC - 1 ) / MC; \
+	const dim_t ic_left =   n % MC; \
+\
+	const dim_t jc_inc  = 1; \
+	const dim_t pc_inc  = 1; \
+	const dim_t ic_inc  = 1; \
+	const dim_t jr_inc  = 1; \
+	/*
+	const dim_t ir_inc  = 1; \
+	*/ \
+\
+	/* Loop over the m dimension (NC rows/columns at a time). */ \
+	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
+	{ \
+		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
+\
+		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
+		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+		dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
+		dim_t jr_left =   nc_cur % MR; \
+\
+		/* An optimization: allow the last jr iteration to contain up to MRE
+		   rows of C and A. (If MRE > MR, the mkernel has agreed to handle
+		   these cases.) Note that this prevents us from declaring jr_iter and
+		   jr_left as const. */ \
+		if ( 1 ) \
+		if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
+		{ \
+			jr_iter--; jr_left += MR; \
+		} \
+\
+		/* Loop over the k dimension (KC rows/columns at a time). */ \
+		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
+		{ \
+			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
+\
+			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
+			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
+\
+			/* Only apply beta to the first iteration of the pc loop. */ \
+			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
+\
+			/* Loop over the n dimension (MC rows at a time). */ \
+			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
+			{ \
+				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
+\
+				ctype* restrict b_ic = b_pc + ii * icstep_b; \
+				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+				/*
+				const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
+				const dim_t ir_left =   mc_cur % NR; \
+				*/ \
+\
+				/* Loop over the m dimension (MR rows at a time). */ \
+				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
+				{ \
+					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
+\
+					ctype* restrict a_jr = a_pc + j * jrstep_a; \
+					ctype* restrict c_jr = c_ic + j * jrstep_c; \
+\
+					/* Loop over the n dimension (NR columns at a time). */ \
+					{ \
+						/* Invoke the gemmsup millikernel. */ \
+						gemmsup_ker \
+						( \
+						  conja, \
+						  conjb, \
+						  nr_cur, /* Notice: nr_cur <= MR. */ \
+						  mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
+						  kc_cur, \
+						  alpha_cast, \
+						  a_jr, rs_a, cs_a, \
+						  b_ic, rs_b, cs_b, \
+						  beta_use, \
+						  c_jr, rs_c, cs_c, \
+						  &aux, \
+						  cntx  \
+						); \
+					} \
+				} \
+			} \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
+
+
+//
+// -- var2m --------------------------------------------------------------------
+//
+
+// Table of type-specific var2m implementations, indexed by datatype (see the
+// lookup via dt_exec below). NOTE(review): presumably GENARRAY fills this with
+// the instances generated by INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m ) --
+// confirm against the macro definition.
+static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
+
+// Object-based front-end for the var2m gemmsup variant.
+//
+// Extracts the datatype, conjugation status, dimensions, buffers and strides
+// from the operand objects and forwards them to the type-specific function
+// selected from ftypes_var2m. If A or B is marked for transposition, the
+// transposition is applied implicitly by swapping that operand's row and
+// column strides (and, for A, by reading k from its length instead of its
+// width).
+//
+// trans selects how the operation itself is expressed: if it is not
+// "notrans", the whole operation is transposed by swapping A with B, m with
+// n, conja with conjb, the row/column strides of every operand, and by
+// transposing eff_id. cntx, rntm, cntl and thread are passed through
+// unchanged.
+void bli_gemmsup_ref_var2m
+     (
+       trans_t trans,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       stor3_t eff_id,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+#if 0
+	// Disabled alternative: alias A and B and induce the transpositions on
+	// the aliases instead of swapping strides by hand (see the #else branch).
+	obj_t at, bt;
+
+	bli_obj_alias_to( a, &at );
+	bli_obj_alias_to( b, &bt );
+
+	// Induce transpositions on A and/or B if either object is marked for
+	// transposition. We can induce "fast" transpositions since the objects
+	// are guaranteed to not have structure or be packed.
+	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
+	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
+
+	const num_t    dt_exec   = bli_obj_dt( c );
+
+	const conj_t   conja     = bli_obj_conj_status( a );
+	const conj_t   conjb     = bli_obj_conj_status( b );
+
+	const dim_t    m         = bli_obj_length( c );
+	const dim_t    n         = bli_obj_width( c );
+
+	const dim_t    k         = bli_obj_width( &at );
+
+	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
+	const inc_t    rs_a      = bli_obj_row_stride( &at );
+	const inc_t    cs_a      = bli_obj_col_stride( &at );
+
+	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
+	const inc_t    rs_b      = bli_obj_row_stride( &bt );
+	const inc_t    cs_b      = bli_obj_col_stride( &bt );
+
+	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t    rs_c      = bli_obj_row_stride( c );
+	const inc_t    cs_c      = bli_obj_col_stride( c );
+
+	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+
+#else
+	// The execution datatype is taken from C.
+	const num_t    dt_exec   = bli_obj_dt( c );
+
+	const conj_t   conja     = bli_obj_conj_status( a );
+	const conj_t   conjb     = bli_obj_conj_status( b );
+
+	// m and n come from C; k is determined from A below.
+	const dim_t    m         = bli_obj_length( c );
+	const dim_t    n         = bli_obj_width( c );
+	      dim_t    k;
+
+	void* restrict buf_a = bli_obj_buffer_at_off( a );
+	      inc_t    rs_a;
+	      inc_t    cs_a;
+
+	void* restrict buf_b = bli_obj_buffer_at_off( b );
+	      inc_t    rs_b;
+	      inc_t    cs_b;
+
+	// Determine k and the strides of A, honoring A's transposition flag.
+	if ( bli_obj_has_notrans( a ) )
+	{
+		k     = bli_obj_width( a );
+
+		rs_a  = bli_obj_row_stride( a );
+		cs_a  = bli_obj_col_stride( a );
+	}
+	else // if ( bli_obj_has_trans( a ) )
+	{
+		// Assign the variables with an implicit transposition.
+		k     = bli_obj_length( a );
+
+		rs_a  = bli_obj_col_stride( a );
+		cs_a  = bli_obj_row_stride( a );
+	}
+
+	// Determine the strides of B, honoring B's transposition flag.
+	if ( bli_obj_has_notrans( b ) )
+	{
+		rs_b  = bli_obj_row_stride( b );
+		cs_b  = bli_obj_col_stride( b );
+	}
+	else // if ( bli_obj_has_trans( b ) )
+	{
+		// Assign the variables with an implicit transposition.
+		rs_b  = bli_obj_col_stride( b );
+		cs_b  = bli_obj_row_stride( b );
+	}
+
+	void* restrict buf_c     = bli_obj_buffer_at_off( c );
+	const inc_t    rs_c      = bli_obj_row_stride( c );
+	const inc_t    cs_c      = bli_obj_col_stride( c );
+
+	// Obtain the alpha and beta scalar buffers for the execution datatype.
+	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
+	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
+
+#endif
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	FUNCPTR_T f = ftypes_var2m[dt_exec];
+
+	if ( bli_is_notrans( trans ) )
+	{
+		// Invoke the function.
+		f
+		(
+		  conja,
+		  conjb,
+		  m,
+		  n,
+		  k,
+		  buf_alpha,
+		  buf_a, rs_a, cs_a,
+		  buf_b, rs_b, cs_b,
+		  buf_beta,
+		  buf_c, rs_c, cs_c,
+		  eff_id,
+		  cntx,
+		  rntm,
+		  cntl,
+		  thread
+		);
+	}
+	else
+	{
+		// Invoke the function (transposing the operation).
+		f
+		(
+		  conjb,             // swap the conj values.
+		  conja,
+		  n,                 // swap the m and n dimensions.
+		  m,
+		  k,
+		  buf_alpha,
+		  buf_b, cs_b, rs_b, // swap the positions of A and B.
+		  buf_a, cs_a, rs_a, // swap the strides of A and B.
+		  buf_beta,
+		  buf_c, cs_c, rs_c, // swap the strides of C.
+		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
+		  cntx,
+		  rntm,
+		  cntl,
+		  thread
+		);
+	}
+}
+
+
+// Type-generic template for the var2m gemmsup implementation; it is
+// instantiated per datatype by INSERT_GENTFUNC_BASIC0 at the bottom.
+//
+// Early exits: returns immediately if m or n is zero; if k < 1 or alpha is
+// zero, C is scaled by beta (via scalm) and the function returns.
+//
+// Otherwise the operands are traversed with the following loop nest
+// (outermost first):
+//   jj: n dimension, NC columns of B and C at a time
+//   pp: k dimension, KC at a time (beta is applied only when pp == 0;
+//       later iterations use one so that they accumulate into C)
+//   ii: m dimension, MC rows of A and C at a time
+//   j:  n dimension, NR columns of B and C at a time
+// and each innermost iteration calls the sup kernel queried from the context
+// on an mc_cur x nr_cur block of C with inner dimension kc_cur.
+//
+// KC equals the context's KC blocksize (KC0) for the RRR, CCC, RRC and CRC
+// storage ids, or when m <= MR and n <= NR; otherwise it is reduced to KC0/2,
+// KC0/3, KC0/4 or KC0/5 as m and n grow relative to MR and NR (the /3 and /5
+// cases are rounded down to a multiple of 4).
+//
+// The rntm, cntl and thread parameters are not referenced in this body.
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       stor3_t          stor_id, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       cntl_t* restrict cntl, \
+       thrinfo_t* restrict thread  \
+     ) \
+{ \
+	/* If m or n is zero, return immediately. */ \
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	/* If k < 1 or alpha is zero, scale by beta and return. */ \
+	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
+	{ \
+		PASTEMAC(ch,scalm) \
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  0, \
+		  BLIS_NONUNIT_DIAG, \
+		  BLIS_DENSE, \
+		  m, n, \
+		  beta, \
+		  c, rs_c, cs_c \
+		); \
+		return; \
+	} \
+\
+	const num_t dt  = PASTEMAC(ch,type); \
+\
+	/* Query the context for various blocksizes. */ \
+	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
+	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
+	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
+	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+	dim_t KC; \
+	if      ( stor_id == BLIS_RRR || \
+	          stor_id == BLIS_CCC    ) KC = KC0; \
+	else if ( stor_id == BLIS_RRC || \
+	          stor_id == BLIS_CRC    ) KC = KC0; \
+	else if ( m <=   MR && n <=   NR ) KC = KC0; \
+	else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
+	else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
+	else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
+	else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
+\
+	/* Query the maximum blocksize for NR, which implies a maximum blocksize
+	   extension for the final iteration. */ \
+	const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
+	const dim_t NRE = NRM - NR; \
+\
+	/* Compute partitioning step values for each matrix of each loop. */ \
+	const inc_t jcstep_c = cs_c * NC; \
+	const inc_t jcstep_b = cs_b * NC; \
+\
+	const inc_t pcstep_a = cs_a * KC; \
+	const inc_t pcstep_b = rs_b * KC; \
+\
+	const inc_t icstep_c = rs_c * MC; \
+	const inc_t icstep_a = rs_a * MC; \
+\
+	const inc_t jrstep_c = cs_c * NR; \
+	const inc_t jrstep_b = cs_b * NR; \
+\
+	/*
+	const inc_t irstep_c = rs_c * MR; \
+	const inc_t irstep_a = rs_a * MR; \
+	*/ \
+\
+	/* Query the context for the sup microkernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemmsup_ker_ft) \
+               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
+\
+	ctype* restrict a_00       = a; \
+	ctype* restrict b_00       = b; \
+	ctype* restrict c_00       = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+\
+	ctype* restrict one        = PASTEMAC(ch,1); \
+\
+	auxinfo_t       aux; \
+\
+	/* Compute number of primary and leftover components of the outer
+	   dimensions.
+	   NOTE: Functionally speaking, we compute jc_iter as:
+	     jc_iter = n / NC; if ( jc_left ) ++jc_iter;
+	   However, this is implemented as:
+	     jc_iter = ( n + NC - 1 ) / NC;
+	   This avoids a branch at the cost of two additional integer instructions.
+	   The pc_iter, ic_iter, and jr_iter variables are computed in a
+	   similar manner. */ \
+	const dim_t jc_iter = ( n + NC - 1 ) / NC; \
+	const dim_t jc_left =   n % NC; \
+\
+	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
+	const dim_t pc_left =   k % KC; \
+\
+	const dim_t ic_iter = ( m + MC - 1 ) / MC; \
+	const dim_t ic_left =   m % MC; \
+\
+	const dim_t jc_inc  = 1; \
+	const dim_t pc_inc  = 1; \
+	const dim_t ic_inc  = 1; \
+	const dim_t jr_inc  = 1; \
+	/*
+	const dim_t ir_inc  = 1; \
+	*/ \
+\
+	/* Loop over the n dimension (NC columns at a time). */ \
+	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
+	{ \
+		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
+\
+		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+		dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+		dim_t jr_left =   nc_cur % NR; \
+\
+		/* An optimization: allow the last jr iteration to absorb a small
+		   remainder, so that it spans up to NR + NRE columns of C and B.
+		   (This presumes the kernel handles widths up to NRM = NR + NRE.)
+		   Note that this prevents jr_iter and jr_left from being const. */ \
+		if ( 1 ) \
+		if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
+		{ \
+			jr_iter--; jr_left += NR; \
+		} \
+\
+		/* Loop over the k dimension (KC rows/columns at a time). */ \
+		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
+		{ \
+			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
+\
+			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+\
+			/* Only apply beta to the first iteration of the pc loop. */ \
+			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
+\
+			/* Loop over the m dimension (MC rows at a time). */ \
+			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
+			{ \
+				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
+\
+				ctype* restrict a_ic = a_pc + ii * icstep_a; \
+				ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+				/*
+				const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
+				const dim_t ir_left =   mc_cur % MR; \
+				*/ \
+\
+				/* Loop over the n dimension (NR columns at a time). */ \
+				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
+				{ \
+					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+					ctype* restrict b_jr = b_pc + j * jrstep_b; \
+					ctype* restrict c_jr = c_ic + j * jrstep_c; \
+\
+					/* No explicit loop over the m dimension here: the kernel receives all mc_cur rows in one call. */ \
+					{ \
+						/* Invoke the gemmsup millikernel. */ \
+						gemmsup_ker \
+						( \
+						  conja, \
+						  conjb, \
+						  mc_cur, \
+						  nr_cur, \
+						  kc_cur, \
+						  alpha_cast, \
+						  a_ic, rs_a, cs_a, \
+						  b_jr, rs_b, cs_b, \
+						  beta_use, \
+						  c_jr, rs_c, cs_c, \
+						  &aux, \
+						  cntx  \
+						); \
+					} \
+				} \
+			} \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
+
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -959,8 +959,7 @@ void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
 	// Process each operation id tuple provided.
 	for ( i = 0; i < n_ops; ++i )
 	{
-		// Read the current ukernel id, ukernel datatype, and ukernel function
-		// pointer.
+		// Read the current operation id and handler function pointer.
 		const opid_t op_id = op_ids[ i ];
 		      void*  op_fp = op_fps[ i ];

--- a/frame/base/bli_env.c
+++ b/frame/base/bli_env.c
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+// Returns the value of the environment variable named by env, parsed as a
+// base-10 integer, or fallback if the variable is not set.
+dim_t bli_env_get_var( const char* env, dim_t fallback )
+{
+	// getenv() yields NULL when the variable does not exist.
+	const char* value = getenv( env );
+
+	// An unset variable selects the caller-supplied fallback.
+	if ( value == NULL ) return fallback;
+
+	// Otherwise hand back whatever strtol() makes of the string. No
+	// validation is performed on the conversion.
+	return strtol( value, NULL, 10 );
+}
+
+#if 0
+// Disabled: writes value, formatted as a decimal string, into the environment
+// variable named by env, overwriting any existing value. On failure the
+// error string for errno is printed via bli_print_msg().
+// NOTE(review): "%u"/"%lu" are unsigned conversions; confirm that they match
+// the signedness and width of dim_t before re-enabling this function.
+void bli_env_set_var( const char* env, dim_t value )
+{
+	dim_t       r_val;
+	char        value_str[32];
+	const char* fs_32 = "%u";
+	const char* fs_64 = "%lu";
+
+	// Convert the integer to a string, but vary the format specifier
+	// depending on the integer type size.
+	if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
+	else                                      sprintf( value_str, fs_64, value );
+
+	// Set the environment variable using the string we just wrote to via
+	// sprintf(). (The 'TRUE' argument means we want to overwrite the current
+	// value if the environment variable already exists.)
+	r_val = bli_setenv( env, value_str, TRUE );
+
+	// Check the return value in case something went horribly wrong.
+	if ( r_val == -1 )
+	{
+		char err_str[128];
+
+		// Query the human-readable error string corresponding to errno.
+		strerror_r( errno, err_str, 128 );
+
+		// Print the error message.
+		bli_print_msg( err_str, __FILE__, __LINE__ );
+	}
+}
+#endif
+
--- a/frame/base/bli_env.h
+++ b/frame/base/bli_env.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_ENV_H
+#define BLIS_ENV_H
+
+// Returns the value of the environment variable env parsed as a base-10
+// integer, or fallback if the variable is not set.
+dim_t bli_env_get_var( const char* env, dim_t fallback );
+//void  bli_env_set_var( const char* env, dim_t value );
+
+#endif
+
--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -34,11 +34,32 @@

 */

+
 #ifndef BLIS_MEM_H
 #define BLIS_MEM_H


-// Mem entry query
+// mem_t object type (defined in bli_type_defs.h)
+
+/*
+typedef struct mem_s
+{
+	pblk_t    pblk;
+	packbuf_t buf_type;
+	pool_t*   pool;
+	siz_t     size;
+} mem_t;
+
+typedef struct
+{
+	void*     buf;
+	siz_t     block_size;
+} pblk_t;
+*/
+
+//
+// -- mem_t query --------------------------------------------------------------
+//

 BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
 {
@@ -78,7 +99,9 @@ BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
 }


-// Mem entry modification
+//
+// -- mem_t modification -------------------------------------------------------
+//

 BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
 {
--- a/frame/base/bli_pack.c
+++ b/frame/base/bli_pack.c
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// The global rntm_t structure. (The definition resides in bli_rntm.c.)
+extern rntm_t global_rntm;
+
+// A mutex to serialize access to global_rntm. (The definition resides in
+// bli_rntm.c.)
+extern bli_pthread_mutex_t global_rntm_mutex;
+
+// -----------------------------------------------------------------------------
+
+// Initializes the packing settings of the global runtime object.
+void bli_pack_init( void )
+{
+	rntm_t* rntm = &global_rntm;
+
+	// Seed the packing fields of the global runtime from the environment.
+	bli_pack_init_rntm_from_env( rntm );
+}
+
+// Counterpart to bli_pack_init(); currently a no-op.
+void bli_pack_finalize( void )
+{
+	return;
+}
+
+// -----------------------------------------------------------------------------
+
+// Returns the pack_a setting stored in the global runtime object.
+dim_t bli_pack_get_pack_a( void )
+{
+	// global_rntm is only meaningful once the library has been initialized.
+	bli_init_once();
+
+	const dim_t pack_a = bli_rntm_pack_a( &global_rntm );
+
+	return pack_a;
+}
+
+// -----------------------------------------------------------------------------
+
+// Returns the pack_b setting stored in the global runtime object.
+dim_t bli_pack_get_pack_b( void )
+{
+	// global_rntm is only meaningful once the library has been initialized.
+	bli_init_once();
+
+	const dim_t pack_b = bli_rntm_pack_b( &global_rntm );
+
+	return pack_b;
+}
+
+// ----------------------------------------------------------------------------
+
+// Stores pack_a in the global runtime object while holding its mutex.
+void bli_pack_set_pack_a( bool_t pack_a )
+{
+	bli_pthread_mutex_t* lock = &global_rntm_mutex;
+
+	// global_rntm must have been initialized before it is modified.
+	bli_init_once();
+
+	// Update the field inside the critical section guarding global_rntm.
+	bli_pthread_mutex_lock( lock );
+	bli_rntm_set_pack_a( pack_a, &global_rntm );
+	bli_pthread_mutex_unlock( lock );
+}
+
+// ----------------------------------------------------------------------------
+
+// Stores pack_b in the global runtime object while holding its mutex.
+void bli_pack_set_pack_b( bool_t pack_b )
+{
+	// We must ensure that global_rntm has been initialized.
+	bli_init_once();
+
+	// Acquire the mutex protecting global_rntm.
+	bli_pthread_mutex_lock( &global_rntm_mutex );
+
+	// Update the pack_b field. (This previously called the pack_a setter,
+	// which clobbered pack_a and left pack_b unchanged.)
+	bli_rntm_set_pack_b( pack_b, &global_rntm );
+
+	// Release the mutex protecting global_rntm.
+	bli_pthread_mutex_unlock( &global_rntm_mutex );
+}
+
+// ----------------------------------------------------------------------------
+
+// Initializes the pack_a and pack_b fields of rntm from the BLIS_PACK_A and
+// BLIS_PACK_B environment variables. For each variable, an unset value or a
+// value of 0 means FALSE; anything else means TRUE.
+void bli_pack_init_rntm_from_env
+     (
+       rntm_t* rntm
+     )
+{
+	// NOTE: We don't need to acquire the global_rntm_mutex here because this
+	// function is only called from bli_pack_init(), which is only called
+	// by bli_init_once().
+
+	bool_t pack_a;
+	bool_t pack_b;
+
+#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
+
+	// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
+	// -1 if it is unset. The raw values are kept in dim_t (the return type of
+	// bli_env_get_var()) rather than in the boolean variables, so that the
+	// -1 sentinel survives regardless of how bool_t is defined.
+	const dim_t env_a = bli_env_get_var( "BLIS_PACK_A", -1 );
+	const dim_t env_b = bli_env_get_var( "BLIS_PACK_B", -1 );
+
+	// Unset (-1) and zero both mean FALSE; anything else is TRUE.
+	pack_a = ( env_a == -1 || env_a == 0 ) ? FALSE : TRUE;
+	pack_b = ( env_b == -1 || env_b == 0 ) ? FALSE : TRUE;
+
+#else
+
+	pack_a = TRUE;
+	pack_b = TRUE;
+
+#endif
+
+	// Save the results back in the runtime object.
+	bli_rntm_set_pack_a( pack_a, rntm );
+	bli_rntm_set_pack_b( pack_b, rntm );
+
+#if 0
+	printf( "bli_pack_init_rntm_from_env()\n" );
+	bli_rntm_print( rntm );
+#endif
+}
+
--- a/frame/base/bli_pack.h
+++ b/frame/base/bli_pack.h
@@ -0,0 +1,49 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_PACK_H
+#define BLIS_PACK_H
+
+// Initialization and finalization of the packing settings.
+void  bli_pack_init( void );
+void  bli_pack_finalize( void );
+
+// Exported accessors for the packing flags kept in the global runtime object.
+BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
+BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
+BLIS_EXPORT_BLIS void  bli_pack_set_pack_a( bool_t pack_a );
+BLIS_EXPORT_BLIS void  bli_pack_set_pack_b( bool_t pack_b );
+
+// Sets the packing flags of rntm from the BLIS_PACK_A and BLIS_PACK_B
+// environment variables.
+void  bli_pack_init_rntm_from_env( rntm_t* rntm );
+
+#endif
+
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -34,6 +34,29 @@

 #include "blis.h"

+// The global rntm_t structure, which holds the global thread settings
+// along with a few other key parameters.
+rntm_t global_rntm;
+
+// A mutex to serialize access to global_rntm.
+bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
+
+// ----------------------------------------------------------------------------
+
+// Copies the current contents of global_rntm into *rntm while holding the
+// mutex that guards the global object.
+void bli_rntm_init_from_global( rntm_t* rntm )
+{
+	bli_pthread_mutex_t* lock = &global_rntm_mutex;
+
+	// global_rntm is only meaningful once the library has been initialized.
+	bli_init_once();
+
+	// Take the snapshot inside the critical section.
+	bli_pthread_mutex_lock( lock );
+	*rntm = global_rntm;
+	bli_pthread_mutex_unlock( lock );
+}
+
 // -----------------------------------------------------------------------------

 void bli_rntm_set_ways_for_op
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -52,11 +52,8 @@ typedef struct rntm_s
 	bool      l3_sup;

 	pool_t*   sba_pool;
-
 	membrk_t* membrk;

-	bool_t    l3_sup;
-
 } rntm_t;
 */

@@ -229,10 +226,6 @@ BLIS_INLINE void bli_rntm_clear_membrk( rntm_t* rntm )
 {
 	bli_rntm_set_membrk( NULL, rntm );
 }
-static void bli_rntm_clear_l3_sup( rntm_t* rntm )
-{
-	bli_rntm_set_l3_sup( 1, rntm );
-}

 //
 // -- rntm_t modification (public API) -----------------------------------------
@@ -321,7 +314,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
          .l3_sup      = TRUE, \
          .sba_pool    = NULL, \
          .membrk      = NULL, \
-          .l3_sup      = 1  \
        }  \

 BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
@@ -330,11 +322,12 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )

 	bli_rntm_clear_num_threads_only( rntm );
 	bli_rntm_clear_ways_only( rntm );
+	bli_rntm_clear_pack_a( rntm );
+	bli_rntm_clear_pack_b( rntm );
+	bli_rntm_clear_l3_sup( rntm );

 	bli_rntm_clear_sba_pool( rntm );
 	bli_rntm_clear_membrk( rntm );
-
-	bli_rntm_clear_l3_sup( rntm );
 }

 // -- rntm_t total thread calculation ------------------------------------------
@@ -359,6 +352,8 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads

 // Function prototypes

+BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
+
 BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
     (
       opid_t  l3_op,
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1185,6 +1185,13 @@ typedef struct
 	inc_t  is_a;
 	inc_t  is_b;

+	// The panel strides of A and B.
+	// NOTE: These are only used in situations where iteration over the
+	// micropanels takes place in part within the kernel code (e.g. sup
+	// millikernels).
+	inc_t  ps_a;
+	inc_t  ps_b;
+
 	// The type to convert to on output.
 	//num_t  dt_on_output;

@@ -1441,6 +1448,9 @@ typedef struct cntx_s

 // -- Runtime type --

+// NOTE: The order of these fields must be kept consistent with the definition
+// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h.
+
 typedef struct rntm_s
 {
 	// "External" fields: these may be queried by the end-user.
@@ -1460,9 +1470,6 @@ typedef struct rntm_s
 	// The packing block allocator, which is attached in the l3 thread decorator.
 	membrk_t* membrk;

-	// A switch to enable/disable small/unpacked matrix handling in level-3 ops.
-	bool_t    l3_sup;
-
 } rntm_t;


--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -130,6 +130,8 @@ extern "C" {
 #include "bli_getopt.h"
 #include "bli_opid.h"
 #include "bli_cntl.h"
+#include "bli_env.h"
+#include "bli_pack.h"
 #include "bli_info.h"
 #include "bli_arch.h"
 #include "bli_cpuid.h"
--- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c
+++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c
@@ -98,8 +98,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Some induced methods execute in multiple "stages". */ \
 	for ( i = 0; i < nstage; ++i ) \
@@ -191,8 +191,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Some induced methods execute in multiple "stages". */ \
 	for ( i = 0; i < nstage; ++i ) \
@@ -282,8 +282,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Some induced methods execute in multiple "stages". */ \
 	for ( i = 0; i < nstage; ++i ) \
@@ -358,8 +358,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Some induced methods execute in multiple "stages". */ \
 	for ( i = 0; i < nstage; ++i ) \
@@ -420,8 +420,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	{ \
 		/* NOTE: trsm cannot be implemented via any induced method that
--- a/frame/ind/oapi/bli_l3_ind_oapi.c
+++ b/frame/ind/oapi/bli_l3_ind_oapi.c
@@ -60,8 +60,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	func( alpha, a, b, beta, c, cntx, rntm ); \
 }
@@ -97,8 +97,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	func( side, alpha, a, b, beta, c, cntx, rntm ); \
 }
@@ -131,8 +131,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	func( alpha, a, beta, c, cntx, rntm ); \
 }
@@ -164,8 +164,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	func( side, alpha, a, b, cntx, rntm ); \
 }
--- a/frame/ind/oapi/bli_l3_nat_oapi.c
+++ b/frame/ind/oapi/bli_l3_nat_oapi.c
@@ -66,8 +66,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Invoke the operation's front end. */ \
 	PASTEMAC(opname,_front) \
@@ -112,8 +112,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Invoke the operation's front end. */ \
 	PASTEMAC(opname,_front) \
@@ -150,8 +150,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Invoke the operation's front end. */ \
 	PASTEMAC(opname,_front) \
@@ -187,8 +187,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Invoke the operation's front end. */ \
 	PASTEMAC(opname,_front) \
@@ -223,8 +223,8 @@ void PASTEMAC(opname,imeth) \
 	/* Initialize a local runtime with global settings if necessary. Note
 	   that in the case that a runtime is passed in, we make a local copy. */ \
 	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                  rntm = &rntm_l; } \
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
 \
 	/* Invoke the operation's front end. */ \
 	PASTEMAC(opname,_front) \
--- a/frame/thread/bli_l3_decor.h
+++ b/frame/thread/bli_l3_decor.h
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_DECOR_H
+#define BLIS_L3_DECOR_H
+
+// -- conventional definitions -------------------------------------------------
+
+// Level-3 internal function type.
+typedef void (*l3int_t)
+     (
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     );
+
+// Level-3 thread decorator prototype.
+void bli_l3_thread_decorator
+     (
+       l3int_t func,
+       opid_t  family,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     );
+
+// Include definitions specific to the method of multithreading for the
+// conventional code path.
+#include "bli_l3_decor_single.h"
+#include "bli_l3_decor_openmp.h"
+#include "bli_l3_decor_pthreads.h"
+
+#endif
+
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -0,0 +1,248 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_OPENMP
+
+// Define a dummy function bli_l3_thread_entry(), which is needed in the
+// pthreads version, so that when building Windows DLLs (with OpenMP enabled
+// or no multithreading) we don't risk having an unresolved symbol.
+void* bli_l3_thread_entry( void* data_void ) { return NULL; }
+
+//#define PRINT_THRINFO
+
+void bli_l3_thread_decorator
+     (
+       l3int_t    func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       cntl_t*    cntl
+     )
+{
+	// This is part of a hack to support mixed domain in bli_gemm_front().
+	// Sometimes we need to specify a non-standard schema for A and B, and
+	// we decided to transmit them via the schema field in the obj_t's
+	// rather than pass them in as function parameters. Once the values
+	// have been read, we immediately reset them back to their expected
+	// values for unpacked objects.
+	pack_t schema_a = bli_obj_pack_schema( a );
+	pack_t schema_b = bli_obj_pack_schema( b );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
+
+	// Query the total number of threads from the rntm_t object.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
+
+	#ifdef PRINT_THRINFO
+	thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
+	#endif
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm. We do
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+
+
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Create a thread-local copy of the master thread's rntm_t. This is
+		// necessary since we want each thread to be able to track its own
+		// small block pool_t as it executes down the function stack.
+		rntm_t           rntm_l = *rntm;
+		rntm_t* restrict rntm_p = &rntm_l;
+
+		// Query the thread's id from OpenMP.
+		const dim_t tid = omp_get_thread_num();
+
+		// Check for a somewhat obscure OpenMP thread-mismatch issue.
+		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
+
+		// Use the thread id to access the appropriate pool_t* within the
+		// array_t, and use it to set the sba_pool field within the rntm_t.
+		// If the pool_t* element within the array_t is NULL, it will first
+		// be allocated/initialized.
+		bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+
+		obj_t      a_t, b_t, c_t;
+		cntl_t*    cntl_use;
+		thrinfo_t* thread;
+
+		// Alias thread-local copies of A, B, and C. These will be the objects
+		// we pass down the algorithmic function stack. Making thread-local
+		// aliases is highly recommended in case a thread needs to change any
+		// of the properties of an object without affecting other threads'
+		// objects.
+		bli_obj_alias_to( a, &a_t );
+		bli_obj_alias_to( b, &b_t );
+		bli_obj_alias_to( c, &c_t );
+
+		// Create a default control tree for the operation, if needed.
+		bli_l3_cntl_create_if( family, schema_a, schema_b,
+		                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
+
+		// Create the root node of the current thread's thrinfo_t structure.
+		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
+
+#if 1
+		func
+		(
+		  alpha,
+		  &a_t,
+		  &b_t,
+		  beta,
+		  &c_t,
+		  cntx,
+		  rntm_p,
+		  cntl_use,
+		  thread
+		);
+#else
+		bli_thrinfo_grow_tree
+		(
+		  rntm_p,
+		  cntl_use,
+		  thread
+		);
+#endif
+
+		// Free the thread's local control tree.
+		bli_l3_cntl_free( rntm_p, cntl_use, thread );
+
+		#ifdef PRINT_THRINFO
+		threads[tid] = thread;
+		#else
+		// Free the current thread's thrinfo_t structure.
+		bli_l3_thrinfo_free( rntm_p, thread );
+		#endif
+	}
+
+	// We shouldn't free the global communicator since it was already freed
+	// by the global communicator's chief thread in bli_l3_thrinfo_free()
+	// (called above).
+
+	#ifdef PRINT_THRINFO
+	if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
+	else                       bli_l3_thrinfo_print_trsm_paths( threads );
+	exit(1);
+	#endif
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_l3_thread_decorator_thread_check
+     (
+       dim_t      n_threads,
+       dim_t      tid,
+       thrcomm_t* gl_comm,
+       rntm_t*    rntm
+     )
+{
+	dim_t n_threads_real = omp_get_num_threads();
+
+	// Check if the number of OpenMP threads created within this parallel
+	// region is different from the number of threads that were requested
+	// of BLIS. This inequality may trigger when, for example, the
+	// following conditions are satisfied:
+	// - an application is executing an OpenMP parallel region in which
+	//   BLIS is invoked,
+	// - BLIS is configured for multithreading via OpenMP,
+	// - OMP_NUM_THREADS = t > 1,
+	// - the number of threads requested of BLIS (regardless of method)
+	//   is p <= t,
+	// - OpenMP nesting is disabled.
+	// In this situation, the application spawns t threads. Each application
+	// thread calls gemm (for example). Each gemm will attempt to spawn p
+	// threads via OpenMP. However, since nesting is disabled, the OpenMP
+	// implementation finds that t >= p threads are already spawned, and
+	// thus it doesn't spawn *any* additional threads for each gemm.
+	if ( n_threads_real != n_threads )
+	{
+		// If the number of threads active in the current region is not
+		// equal to the number requested of BLIS, we then only continue
+		// if the number of threads in the current region is 1. If, for
+		// example, BLIS requested 4 threads but only got 3, then we
+		// abort().
+		//if ( tid == 0 )
+		//{
+			if ( n_threads_real != 1 )
+			{
+				bli_print_msg( "A different number of threads was "
+				               "created than was requested.",
+				               __FILE__, __LINE__ );
+				bli_abort();
+			}
+
+			//n_threads = 1; // not needed since it has no effect?
+			bli_thrcomm_init( 1, gl_comm );
+			bli_rntm_set_num_threads_only( 1, rntm );
+			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
+		//}
+
+		// Synchronize all threads and continue.
+		_Pragma( "omp barrier" )
+	}
+}
+
+#endif
+
--- a/frame/thread/bli_l3_decor_openmp.h
+++ b/frame/thread/bli_l3_decor_openmp.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_DECOR_OPENMP_H
+#define BLIS_L3_DECOR_OPENMP_H
+
+// Definitions specific to situations when OpenMP multithreading is enabled.
+#ifdef BLIS_ENABLE_OPENMP
+
+void bli_l3_thread_decorator_thread_check
+     (
+       dim_t      n_threads,
+       dim_t      tid,
+	   thrcomm_t* gl_comm,
+       rntm_t*    rntm
+     );
+
+#endif
+
+#endif
+
--- a/frame/thread/bli_l3_decor_pthreads.c
+++ b/frame/thread/bli_l3_decor_pthreads.c
@@ -0,0 +1,252 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_PTHREADS
+
+// A data structure to assist in passing operands to additional threads.
+typedef struct thread_data
+{
+	l3int_t    func;
+	opid_t     family;
+	pack_t     schema_a;
+	pack_t     schema_b;
+	obj_t*     alpha;
+	obj_t*     a;
+	obj_t*     b;
+	obj_t*     beta;
+	obj_t*     c;
+	cntx_t*    cntx;
+	rntm_t*    rntm;
+	cntl_t*    cntl;
+	dim_t      tid;
+	thrcomm_t* gl_comm;
+	array_t*   array;
+} thread_data_t;
+
+// Entry point for additional threads
+void* bli_l3_thread_entry( void* data_void )
+{
+	thread_data_t* data     = data_void;
+
+	l3int_t        func     = data->func;
+	opid_t         family   = data->family;
+	pack_t         schema_a = data->schema_a;
+	pack_t         schema_b = data->schema_b;
+	obj_t*         alpha    = data->alpha;
+	obj_t*         a        = data->a;
+	obj_t*         b        = data->b;
+	obj_t*         beta     = data->beta;
+	obj_t*         c        = data->c;
+	cntx_t*        cntx     = data->cntx;
+	rntm_t*        rntm     = data->rntm;
+	cntl_t*        cntl     = data->cntl;
+	dim_t          tid      = data->tid;
+	array_t*       array    = data->array;
+	thrcomm_t*     gl_comm  = data->gl_comm;
+
+	// Create a thread-local copy of the master thread's rntm_t. This is
+	// necessary since we want each thread to be able to track its own
+	// small block pool_t as it executes down the function stack.
+	rntm_t           rntm_l = *rntm;
+	rntm_t* restrict rntm_p = &rntm_l;
+
+	// Use the thread id to access the appropriate pool_t* within the
+	// array_t, and use it to set the sba_pool field within the rntm_t.
+	// If the pool_t* element within the array_t is NULL, it will first
+	// be allocated/initialized.
+	bli_sba_rntm_set_pool( tid, array, rntm_p );
+
+	obj_t          a_t, b_t, c_t;
+	cntl_t*        cntl_use;
+	thrinfo_t*     thread;
+
+	// Alias thread-local copies of A, B, and C. These will be the objects
+	// we pass down the algorithmic function stack. Making thread-local
+	// aliases is highly recommended in case a thread needs to change any
+	// of the properties of an object without affecting other threads'
+	// objects.
+	bli_obj_alias_to( a, &a_t );
+	bli_obj_alias_to( b, &b_t );
+	bli_obj_alias_to( c, &c_t );
+
+	// Create a default control tree for the operation, if needed.
+	bli_l3_cntl_create_if( family, schema_a, schema_b,
+	                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
+
+	// Create the root node of the current thread's thrinfo_t structure.
+	bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
+
+	func
+	(
+	  alpha,
+	  &a_t,
+	  &b_t,
+	  beta,
+	  &c_t,
+	  cntx,
+	  rntm_p,
+	  cntl_use,
+	  thread
+	);
+
+	// Free the thread's local control tree.
+	bli_l3_cntl_free( rntm_p, cntl_use, thread );
+
+	// Free the current thread's thrinfo_t structure.
+	bli_l3_thrinfo_free( rntm_p, thread );
+
+	return NULL;
+}
+
+void bli_l3_thread_decorator
+     (
+       l3int_t    func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       cntl_t*    cntl
+     )
+{
+	// This is part of a hack to support mixed domain in bli_gemm_front().
+	// Sometimes we need to specify a non-standard schema for A and B, and
+	// we decided to transmit them via the schema field in the obj_t's
+	// rather than pass them in as function parameters. Once the values
+	// have been read, we immediately reset them back to their expected
+	// values for unpacked objects.
+	pack_t schema_a = bli_obj_pack_schema( a );
+	pack_t schema_b = bli_obj_pack_schema( b );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
+
+	// Query the total number of threads from the rntm_t object.
+	const dim_t n_threads = bli_rntm_num_threads( rntm );
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm. We do
+	// this up-front only so that we have the rntm_t.sba_pool field
+	// initialized and ready for the global communicator creation below.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm. This will be
+	// inherited by all of the child threads when they make local copies of
+	// the rntm below.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// Allocate a global communicator for the root thrinfo_t structures.
+	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
+
+	// Allocate an array of pthread objects and auxiliary data structs to pass
+	// to the thread entry functions.
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
+
+	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
+	// can spawn all other threads before proceeding with its own computation.
+	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
+	{
+		// Set up the thread data for every thread, including thread 0.
+		datas[tid].func     = func;
+		datas[tid].family   = family;
+		datas[tid].schema_a = schema_a;
+		datas[tid].schema_b = schema_b;
+		datas[tid].alpha    = alpha;
+		datas[tid].a        = a;
+		datas[tid].b        = b;
+		datas[tid].beta     = beta;
+		datas[tid].c        = c;
+		datas[tid].cntx     = cntx;
+		datas[tid].rntm     = rntm;
+		datas[tid].cntl     = cntl;
+		datas[tid].tid      = tid;
+		datas[tid].gl_comm  = gl_comm;
+		datas[tid].array    = array;
+
+		// Spawn new threads for ids greater than 0; thread 0 runs in place.
+		if ( tid != 0 )
+			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
+		else
+			bli_l3_thread_entry( ( void* )(&datas[0]) );
+	}
+
+	// We shouldn't free the global communicator since it was already freed
+	// by the global communicator's chief thread in bli_l3_thrinfo_free()
+	// (called from the thread entry function).
+
+	// Thread 0 waits for additional threads to finish.
+	for ( dim_t tid = 1; tid < n_threads; tid++ )
+	{
+		bli_pthread_join( pthreads[tid], NULL );
+	}
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion.
+	bli_sba_checkin_array( array );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( pthreads );
+
+	#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_l3_thread_decorator().pth: " );
+	#endif
+	bli_free_intl( datas );
+}
+
+#endif
+
--- a/frame/thread/bli_l3_decor_pthreads.h
+++ b/frame/thread/bli_l3_decor_pthreads.h
@@ -0,0 +1,47 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_DECOR_PTHREADS_H
+#define BLIS_L3_DECOR_PTHREADS_H
+
+// Definitions specific to situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+// Thread entry point prototype.
+void* bli_l3_thread_entry( void* data_void );
+
+#endif
+
+#endif
+
--- a/frame/thread/bli_l3_decor_single.c
+++ b/frame/thread/bli_l3_decor_single.c
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+// Level-3 thread decorator for builds without multithreading.
+//
+// Sets up the per-call runtime state (sba pool, packing block allocator,
+// communicator, control tree, root thrinfo_t), invokes func exactly once
+// on the calling thread, and then tears that state down again.
+//
+//   func    - the level-3 internal function to execute.
+//   family  - operation family, forwarded to bli_l3_cntl_create_if().
+//   alpha, a, b, beta, c, cntx - forwarded to func unchanged (apart from
+//             the pack schema fields of a and b, which are reset below).
+//   rntm    - runtime object; its sba pool and membrk fields are set here.
+//   cntl    - caller-supplied control tree, forwarded to
+//             bli_l3_cntl_create_if().
+void bli_l3_thread_decorator
+     (
+       l3int_t    func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       cntl_t*    cntl
+     )
+{
+	// Sequential execution: a single thread, whose id is always zero.
+	const dim_t nt      = 1;
+	const dim_t work_id = 0;
+
+	// Mixed-domain hack (see bli_gemm_front()): non-standard pack schemas
+	// for A and B arrive via the schema field of the obj_t's instead of as
+	// function parameters. Read both first, then immediately restore the
+	// value expected for unpacked objects.
+	const pack_t a_schema = bli_obj_pack_schema( a );
+	const pack_t b_schema = bli_obj_pack_schema( b );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
+	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Borrow an array_t from the small block allocator. The checkout is
+	// guarded by a lock internal to the sba, and the array_t is resized
+	// automatically if necessary.
+	array_t* restrict sba_array = bli_sba_checkout_array( nt );
+
+	// Embed thread 0's pool_t* into the rntm. This has to happen before
+	// the communicator is created below. (Since there is only one thread,
+	// there is no need to repeat this per thread later on.)
+	bli_sba_rntm_set_pool( work_id, sba_array, rntm );
+
+	// Record the packing block allocator in the rntm.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// Allocate the global communicator for the root thrinfo_t structure.
+	thrcomm_t* restrict root_comm = bli_thrcomm_create( rntm, nt );
+
+	// NOTE: The rntm is used directly rather than copied, since a copy was
+	// already made in one of the high-level oapi functions. Likewise, A, B,
+	// and C are not aliased here because bli_*_front() already did so.
+
+	// Build a default control tree for the operation, if one is needed.
+	cntl_t* local_cntl;
+	bli_l3_cntl_create_if( family, a_schema, b_schema,
+	                       a, b, c, rntm, cntl, &local_cntl );
+
+	// Build the root node of this thread's thrinfo_t structure.
+	thrinfo_t* root_info;
+	bli_l3_thrinfo_create_root( work_id, root_comm, rntm, local_cntl, &root_info );
+
+	// Execute the operation.
+	func( alpha, a, b, beta, c, cntx, rntm, local_cntl, root_info );
+
+	// Release the local control tree, then the thrinfo_t structure.
+	// The global communicator is not freed explicitly: that is done by the
+	// communicator's chief thread inside bli_l3_thrinfo_free().
+	bli_l3_cntl_free( rntm, local_cntl, root_info );
+	bli_l3_thrinfo_free( rntm, root_info );
+
+	// Return the array_t to the small block allocator; as with the
+	// checkout, this is serialized by a lock embedded within the sba.
+	bli_sba_checkin_array( sba_array );
+}
+
+#endif
+
--- a/frame/thread/bli_l3_decor_single.h
+++ b/frame/thread/bli_l3_decor_single.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_DECOR_SINGLE_H
+#define BLIS_L3_DECOR_SINGLE_H
+
+// Definitions specific to situations when multithreading is disabled.
+// NOTE: Currently empty; this header declares nothing of its own.
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+#endif // !BLIS_ENABLE_MULTITHREADING
+
+#endif // BLIS_L3_DECOR_SINGLE_H
+
--- a/frame/thread/bli_l3_sup_decor.h
+++ b/frame/thread/bli_l3_sup_decor.h
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_SUP_DECOR_H
+#define BLIS_L3_SUP_DECOR_H
+
+// -- sup definitions ----------------------------------------------------------
+
+// Level-3 sup internal function type.
+// This is the type of the func argument accepted by
+// bli_l3_sup_thread_decorator() below. Unlike the decorator itself, it also
+// receives a control tree (cntl) and a thrinfo_t (thread), and it reports
+// its outcome as an err_t.
+typedef err_t (*l3supint_t)
+     (
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       cntl_t*    cntl,
+       thrinfo_t* thread
+     );
+
+// Level-3 sup thread decorator prototype.
+// Definitions are provided by bli_l3_sup_decor_single.c,
+// bli_l3_sup_decor_openmp.c, and bli_l3_sup_decor_pthreads.c, each guarded
+// by the preprocessor macro for its threading configuration.
+// NOTE: The schema_a/schema_b parameters are commented out, i.e. they are
+// not part of the current interface.
+err_t bli_l3_sup_thread_decorator
+     (
+       l3supint_t func,
+       opid_t     family,
+       //pack_t     schema_a,
+       //pack_t     schema_b,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     );
+
+// Include definitions specific to the method of multithreading for the
+// sup code path.
+#include "bli_l3_sup_decor_single.h"
+#include "bli_l3_sup_decor_openmp.h"
+#include "bli_l3_sup_decor_pthreads.h"
+
+#endif // BLIS_L3_SUP_DECOR_H
+
--- a/frame/thread/bli_l3_sup_decor_openmp.c
+++ b/frame/thread/bli_l3_sup_decor_openmp.c
@@ -0,0 +1,190 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_OPENMP
+
+// NOTE: A dummy definition of bli_l3_sup_thread_entry(), which is needed in
+// the pthreads version, could be provided here so that when building Windows
+// DLLs (with OpenMP enabled or no multithreading) we don't risk having an
+// unresolved symbol. That definition is currently disabled:
+//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
+
+
+
+// Level-3 sup thread decorator (OpenMP build).
+//
+// NOTE: Although this definition is compiled when OpenMP is enabled, it does
+// not parallelize anything yet: func is invoked exactly once, on the calling
+// thread, with a NULL control tree and the predefined
+// BLIS_PACKM_SINGLE_THREADED thrinfo_t.
+//
+//   func    - the level-3 sup internal function to execute.
+//   family  - operation family; currently not consulted, since no control
+//             tree is created for the sup code path.
+//   alpha, a, b, beta, c, cntx - forwarded to func unchanged.
+//   rntm    - runtime object; its sba pool and membrk fields are set here
+//             before it is forwarded to func.
+//
+// Returns the err_t reported by func, so that a failure in the sup
+// implementation is visible to the caller instead of being masked by an
+// unconditional BLIS_SUCCESS.
+err_t bli_l3_sup_thread_decorator
+     (
+       l3supint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	// Execution is sequential for now, so only one thread is used.
+	const dim_t n_threads = 1;
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// The sup code path does not build a control tree, a global
+	// communicator, or a per-call thrinfo_t tree. Instead, func receives a
+	// NULL control tree and the predefined single-threaded thrinfo_t, so
+	// there is nothing of that kind to free afterwards.
+	cntl_t*    cntl_use = NULL;
+	thrinfo_t* thread   = &BLIS_PACKM_SINGLE_THREADED;
+
+	// Execute the operation, remembering its status so that it can be
+	// returned once the sba array has been checked back in.
+	const err_t r_val =
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl_use,
+	  thread
+	);
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion. This must happen regardless of the status of func.
+	bli_sba_checkin_array( array );
+
+	return r_val;
+}
+
+#endif
+
--- a/frame/thread/bli_l3_sup_decor_openmp.h
+++ b/frame/thread/bli_l3_sup_decor_openmp.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
+#define BLIS_L3_SUP_DECOR_OPENMP_H
+
+// Definitions specific to situations when OpenMP multithreading is enabled.
+// NOTE: Currently empty; this header declares nothing of its own.
+#ifdef BLIS_ENABLE_OPENMP
+
+#endif // BLIS_ENABLE_OPENMP
+
+#endif // BLIS_L3_SUP_DECOR_OPENMP_H
+
--- a/frame/thread/bli_l3_sup_decor_pthreads.c
+++ b/frame/thread/bli_l3_sup_decor_pthreads.c
@@ -0,0 +1,183 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_PTHREADS
+
+// Level-3 sup thread decorator (pthreads build).
+//
+// NOTE: Although this definition is compiled when POSIX threads are enabled,
+// it does not spawn any threads yet: func is invoked exactly once, on the
+// calling thread, with a NULL control tree and the predefined
+// BLIS_PACKM_SINGLE_THREADED thrinfo_t.
+//
+//   func    - the level-3 sup internal function to execute.
+//   family  - operation family; currently not consulted, since no control
+//             tree is created for the sup code path.
+//   alpha, a, b, beta, c, cntx - forwarded to func unchanged.
+//   rntm    - runtime object; its sba pool and membrk fields are set here
+//             before it is forwarded to func.
+//
+// Returns the err_t reported by func, so that a failure in the sup
+// implementation is visible to the caller instead of being masked by an
+// unconditional BLIS_SUCCESS.
+err_t bli_l3_sup_thread_decorator
+     (
+       l3supint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	// Execution is sequential for now, so only one thread is used.
+	const dim_t n_threads = 1;
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// The sup code path does not build a control tree, a global
+	// communicator, or a per-call thrinfo_t tree. Instead, func receives a
+	// NULL control tree and the predefined single-threaded thrinfo_t, so
+	// there is nothing of that kind to free afterwards.
+	cntl_t*    cntl_use = NULL;
+	thrinfo_t* thread   = &BLIS_PACKM_SINGLE_THREADED;
+
+	// Execute the operation, remembering its status so that it can be
+	// returned once the sba array has been checked back in.
+	const err_t r_val =
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl_use,
+	  thread
+	);
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion. This must happen regardless of the status of func.
+	bli_sba_checkin_array( array );
+
+	return r_val;
+}
+
+#endif
+
--- a/frame/thread/bli_l3_sup_decor_pthreads.h
+++ b/frame/thread/bli_l3_sup_decor_pthreads.h
@@ -0,0 +1,47 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
+#define BLIS_L3_SUP_DECOR_PTHREADS_H
+
+// Definitions specific to situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+// Thread entry point prototype. The void*-in/void*-out signature has the
+// shape of a pthreads start routine.
+// NOTE(review): bli_l3_sup_decor_pthreads.c, as added alongside this header,
+// defines only bli_l3_sup_thread_decorator() and no bli_l3_sup_thread_entry().
+// Confirm that a definition exists elsewhere before referencing this symbol.
+void* bli_l3_sup_thread_entry( void* data_void );
+
+#endif // BLIS_ENABLE_PTHREADS
+
+#endif // BLIS_L3_SUP_DECOR_PTHREADS_H
+
--- a/frame/thread/bli_l3_sup_decor_single.c
+++ b/frame/thread/bli_l3_sup_decor_single.c
@@ -0,0 +1,183 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+// Level-3 sup thread decorator (build without multithreading).
+//
+// Prepares the runtime's allocator state, invokes func exactly once on the
+// calling thread with a NULL control tree and the predefined
+// BLIS_PACKM_SINGLE_THREADED thrinfo_t, and then releases the allocator
+// state again.
+//
+//   func    - the level-3 sup internal function to execute.
+//   family  - operation family; currently not consulted, since no control
+//             tree is created for the sup code path.
+//   alpha, a, b, beta, c, cntx - forwarded to func unchanged.
+//   rntm    - runtime object; its sba pool and membrk fields are set here
+//             before it is forwarded to func.
+//
+// Returns the err_t reported by func, so that a failure in the sup
+// implementation is visible to the caller instead of being masked by an
+// unconditional BLIS_SUCCESS.
+err_t bli_l3_sup_thread_decorator
+     (
+       l3supint_t func,
+       opid_t     family,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm
+     )
+{
+	// For sequential execution, we use only one thread.
+	const dim_t n_threads = 1;
+
+	// NOTE: The sba was initialized in bli_init().
+
+	// Check out an array_t from the small block allocator. This is done
+	// with an internal lock to ensure only one application thread accesses
+	// the sba at a time. bli_sba_checkout_array() will also automatically
+	// resize the array_t, if necessary.
+	array_t* restrict array = bli_sba_checkout_array( n_threads );
+
+	// Access the pool_t* for thread 0 and embed it into the rntm.
+	bli_sba_rntm_set_pool( 0, array, rntm );
+
+	// Set the packing block allocator field of the rntm.
+	bli_membrk_rntm_set_membrk( rntm );
+
+	// The sup code path does not build a control tree, a global
+	// communicator, or a per-call thrinfo_t tree. Instead, func receives a
+	// NULL control tree and the predefined single-threaded thrinfo_t, so
+	// there is nothing of that kind to free afterwards.
+	cntl_t*    cntl_use = NULL;
+	thrinfo_t* thread   = &BLIS_PACKM_SINGLE_THREADED;
+
+	// Execute the operation, remembering its status so that it can be
+	// returned once the sba array has been checked back in.
+	const err_t r_val =
+	func
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm,
+	  cntl_use,
+	  thread
+	);
+
+	// Check the array_t back into the small block allocator. Similar to the
+	// check-out, this is done using a lock embedded within the sba to ensure
+	// mutual exclusion. This must happen regardless of the status of func.
+	bli_sba_checkin_array( array );
+
+	return r_val;
+}
+
+#endif
+
--- a/frame/thread/bli_l3_sup_decor_single.h
+++ b/frame/thread/bli_l3_sup_decor_single.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
+#define BLIS_L3_SUP_DECOR_SINGLE_H
+
+// Definitions specific to situations when multithreading is disabled.
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+#endif
+
+#endif
+
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -43,10 +43,6 @@
 #include "bli_thrcomm_pthreads.h"


-// thread entry point prototype.
-void* bli_l3_thread_entry( void* data_void );
-
-
 // thrcomm_t query (field only)

 BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
--- a/frame/thread/bli_thrcomm_openmp.c
+++ b/frame/thread/bli_thrcomm_openmp.c
@@ -214,212 +214,5 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )

 #endif

-
-// Define a dummy function bli_l3_thread_entry(), which is needed in the
-// pthreads version, so that when building Windows DLLs (with OpenMP enabled
-// or no multithreading) we don't risk having an unresolved symbol.
-void* bli_l3_thread_entry( void* data_void ) { return NULL; }
-
-//#define PRINT_THRINFO
-
-void bli_l3_thread_decorator
-     (
-       l3int_t     func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm,
-       cntl_t*     cntl
-     )
-{
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
-	// Query the total number of threads from the rntm_t object.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	#ifdef PRINT_THRINFO
-	thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
-	#endif
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_membrk_rntm_set_membrk( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-
-	_Pragma( "omp parallel num_threads(n_threads)" )
-	{
-		// Create a thread-local copy of the master thread's rntm_t. This is
-		// necessary since we want each thread to be able to track its own
-		// small block pool_t as it executes down the function stack.
-		rntm_t           rntm_l = *rntm;
-		rntm_t* restrict rntm_p = &rntm_l;
-
-		// Query the thread's id from OpenMP.
-		const dim_t tid = omp_get_thread_num();
-
-		// Check for a somewhat obscure OpenMP thread-mistmatch issue.
-		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-
-		obj_t      a_t, b_t, c_t;
-		cntl_t*    cntl_use;
-		thrinfo_t* thread;
-
-		// Alias thread-local copies of A, B, and C. These will be the objects
-		// we pass down the algorithmic function stack. Making thread-local
-		// alaises is highly recommended in case a thread needs to change any
-		// of the properties of an object without affecting other threads'
-		// objects.
-		bli_obj_alias_to( a, &a_t );
-		bli_obj_alias_to( b, &b_t );
-		bli_obj_alias_to( c, &c_t );
-
-		// Create a default control tree for the operation, if needed.
-		bli_l3_cntl_create_if( family, schema_a, schema_b,
-		                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
-
-		// Create the root node of the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-#if 1
-		func
-		(
-		  alpha,
-		  &a_t,
-		  &b_t,
-		  beta,
-		  &c_t,
-		  cntx,
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-#else
-		bli_thrinfo_grow_tree
-		(
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
 #endif

-		// Free the thread's local control tree.
-		bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-		#ifdef PRINT_THRINFO
-		threads[tid] = thread;
-		#else
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-		#endif
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	#ifdef PRINT_THRINFO
-	if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
-	else                       bli_l3_thrinfo_print_trsm_paths( threads );
-	exit(1);
-	#endif
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_l3_thread_decorator_thread_check
-     (
-       dim_t      n_threads,
-       dim_t      tid,
-       thrcomm_t* gl_comm,
-       rntm_t*    rntm
-     )
-{
-	dim_t n_threads_real = omp_get_num_threads();
-
-	// Check if the number of OpenMP threads created within this parallel
-	// region is different from the number of threads that were requested
-	// of BLIS. This inequality may trigger when, for example, the
-	// following conditions are satisfied:
-	// - an application is executing an OpenMP parallel region in which
-	//   BLIS is invoked,
-	// - BLIS is configured for multithreading via OpenMP,
-	// - OMP_NUM_THREADS = t > 1,
-	// - the number of threads requested of BLIS (regardless of method)
-	//   is p <= t,
-	// - OpenMP nesting is disabled.
-	// In this situation, the application spawns t threads. Each application
-	// thread calls gemm (for example). Each gemm will attempt to spawn p
-	// threads via OpenMP. However, since nesting is disabled, the OpenMP
-	// implementation finds that t >= p threads are already spawned, and
-	// thus it doesn't spawn *any* additional threads for each gemm.
-	if ( n_threads_real != n_threads )
-	{
-		// If the number of threads active in the current region is not
-		// equal to the number requested of BLIS, we then only continue
-		// if the number of threads in the current region is 1. If, for
-		// example, BLIS requested 4 threads but only got 3, then we
-		// abort().
-		//if ( tid == 0 )
-		//{
-			if ( n_threads_real != 1 )
-			{
-				bli_print_msg( "A different number of threads was "
-				               "created than was requested.",
-				               __FILE__, __LINE__ );
-				bli_abort();
-			}
-
-			//n_threads = 1; // not needed since it has no effect?
-			bli_thrcomm_init( 1, gl_comm );
-			bli_rntm_set_num_threads_only( 1, rntm );
-			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
-		//}
-
-		// Synchronize all threads and continue.
-		_Pragma( "omp barrier" )
-	}
-}
-
-#endif
--- a/frame/thread/bli_thrcomm_openmp.h
+++ b/frame/thread/bli_thrcomm_openmp.h
@@ -85,14 +85,6 @@ void        bli_thrcomm_tree_barrier_free( barrier_t* barrier );
 void        bli_thrcomm_tree_barrier( barrier_t* barack );
 #endif

-void bli_l3_thread_decorator_thread_check
-     (
-       dim_t      n_threads,
-       dim_t      tid,
-	   thrcomm_t* gl_comm,
-       rntm_t*    rntm
-     );
-
 #endif

 #endif
--- a/frame/thread/bli_thrcomm_pthreads.c
+++ b/frame/thread/bli_thrcomm_pthreads.c
@@ -138,217 +138,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )

 #endif

-
-// A data structure to assist in passing operands to additional threads.
-typedef struct thread_data
-{
-	l3int_t    func;
-	opid_t     family;
-	pack_t     schema_a;
-	pack_t     schema_b;
-	obj_t*     alpha;
-	obj_t*     a;
-	obj_t*     b;
-	obj_t*     beta;
-	obj_t*     c;
-	cntx_t*    cntx;
-	rntm_t*    rntm;
-	cntl_t*    cntl;
-	dim_t      tid;
-	thrcomm_t* gl_comm;
-	array_t*   array;
-} thread_data_t;
-
-// Entry point for additional threads
-void* bli_l3_thread_entry( void* data_void )
-{
-	thread_data_t* data     = data_void;
-
-	l3int_t        func     = data->func;
-	opid_t         family   = data->family;
-	pack_t         schema_a = data->schema_a;
-	pack_t         schema_b = data->schema_b;
-	obj_t*         alpha    = data->alpha;
-	obj_t*         a        = data->a;
-	obj_t*         b        = data->b;
-	obj_t*         beta     = data->beta;
-	obj_t*         c        = data->c;
-	cntx_t*        cntx     = data->cntx;
-	rntm_t*        rntm     = data->rntm;
-	cntl_t*        cntl     = data->cntl;
-	dim_t          tid      = data->tid;
-	array_t*       array    = data->array;
-	thrcomm_t*     gl_comm  = data->gl_comm;
-
-	// Create a thread-local copy of the master thread's rntm_t. This is
-	// necessary since we want each thread to be able to track its own
-	// small block pool_t as it executes down the function stack.
-	rntm_t           rntm_l = *rntm;
-	rntm_t* restrict rntm_p = &rntm_l;
-
-	// Use the thread id to access the appropriate pool_t* within the
-	// array_t, and use it to set the sba_pool field within the rntm_t.
-	// If the pool_t* element within the array_t is NULL, it will first
-	// be allocated/initialized.
-	bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-	obj_t          a_t, b_t, c_t;
-	cntl_t*        cntl_use;
-	thrinfo_t*     thread;
-
-	// Alias thread-local copies of A, B, and C. These will be the objects
-	// we pass down the algorithmic function stack. Making thread-local
-	// alaises is highly recommended in case a thread needs to change any
-	// of the properties of an object without affecting other threads'
-	// objects.
-	bli_obj_alias_to( a, &a_t );
-	bli_obj_alias_to( b, &b_t );
-	bli_obj_alias_to( c, &c_t );
-
-	// Create a default control tree for the operation, if needed.
-	bli_l3_cntl_create_if( family, schema_a, schema_b,
-	                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
-
-	// Create the root node of the current thread's thrinfo_t structure.
-	bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-	func
-	(
-	  alpha,
-	  &a_t,
-	  &b_t,
-	  beta,
-	  &c_t,
-	  cntx,
-	  rntm_p,
-	  cntl_use,
-	  thread
-	);
-
-	// Free the thread's local control tree.
-	bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-	// Free the current thread's thrinfo_t structure.
-	bli_l3_thrinfo_free( rntm_p, thread );
-
-	return NULL;
-}
-
-void bli_l3_thread_decorator
-     (
-       l3int_t     func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm,
-       cntl_t*     cntl
-     )
-{
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
-	// Query the total number of threads from the context.
-	const dim_t n_threads = bli_rntm_num_threads( rntm );
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we have the rntm_t.sba_pool field
-	// initialized and ready for the global communicator creation below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm. This will be
-	// inherited by all of the child threads when they make local copies of
-	// the rntm below.
-	bli_membrk_rntm_set_membrk( rntm );
-
-	// Allocate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-	// Allocate an array of pthread objects and auxiliary data structs to pass
-	// to the thread entry functions.
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
-
-	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
-	// can spawn all other threads before proceeding with its own computation.
-	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
-	{
-		// Set up thread data for additional threads (beyond thread 0).
-		datas[tid].func     = func;
-		datas[tid].family   = family;
-		datas[tid].schema_a = schema_a;
-		datas[tid].schema_b = schema_b;
-		datas[tid].alpha    = alpha;
-		datas[tid].a        = a;
-		datas[tid].b        = b;
-		datas[tid].beta     = beta;
-		datas[tid].c        = c;
-		datas[tid].cntx     = cntx;
-		datas[tid].rntm     = rntm;
-		datas[tid].cntl     = cntl;
-		datas[tid].tid      = tid;
-		datas[tid].gl_comm  = gl_comm;
-		datas[tid].array    = array;
-
-		// Spawn additional threads for ids greater than 1.
-		if ( tid != 0 )
-			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
-		else
-			bli_l3_thread_entry( ( void* )(&datas[0]) );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called from the thread entry function).
-
-	// Thread 0 waits for additional threads to finish.
-	for ( dim_t tid = 1; tid < n_threads; tid++ )
-	{
-		bli_pthread_join( pthreads[tid], NULL );
-	}
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( pthreads );
-
-	#ifdef BLIS_ENABLE_MEM_TRACING
-	printf( "bli_l3_thread_decorator().pth: " );
-	#endif
-	bli_free_intl( datas );
-}
-
 #endif

--- a/frame/thread/bli_thrcomm_single.c
+++ b/frame/thread/bli_thrcomm_single.c
@@ -84,119 +84,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
 	return;
 }

-// Define a dummy function bli_l3_thread_entry(), which is needed in the
-// pthreads version, so that when building Windows DLLs (with OpenMP enabled
-// or no multithreading) we don't risk having an unresolved symbol.
-void* bli_l3_thread_entry( void* data_void ) { return NULL; }
-
-void bli_l3_thread_decorator
-     (
-       l3int_t     func,
-       opid_t      family,
-       obj_t*      alpha,
-       obj_t*      a,
-       obj_t*      b,
-       obj_t*      beta,
-       obj_t*      c,
-       cntx_t*     cntx,
-       rntm_t*     rntm,
-       cntl_t*     cntl
-     )
-{
-	// This is part of a hack to support mixed domain in bli_gemm_front().
-	// Sometimes we need to specify a non-standard schema for A and B, and
-	// we decided to transmit them via the schema field in the obj_t's
-	// rather than pass them in as function parameters. Once the values
-	// have been read, we immediately reset them back to their expected
-	// values for unpacked objects.
-	pack_t schema_a = bli_obj_pack_schema( a );
-	pack_t schema_b = bli_obj_pack_schema( b );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
-	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
-
-	// For sequential execution, we use only one thread.
-	const dim_t n_threads = 1;
-
-	// NOTE: The sba was initialized in bli_init().
-
-	// Check out an array_t from the small block allocator. This is done
-	// with an internal lock to ensure only one application thread accesses
-	// the sba at a time. bli_sba_checkout_array() will also automatically
-	// resize the array_t, if necessary.
-	array_t* restrict array = bli_sba_checkout_array( n_threads );
-
-	// Access the pool_t* for thread 0 and embed it into the rntm. We do
-	// this up-front only so that we can create the global comm below.
-	bli_sba_rntm_set_pool( 0, array, rntm );
-
-	// Set the packing block allocator field of the rntm.
-	bli_membrk_rntm_set_membrk( rntm );
-
-	// Allcoate a global communicator for the root thrinfo_t structures.
-	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
-
-
-	{
-		// NOTE: We don't need to create another copy of the rntm_t since
-		// it was already copied in one of the high-level oapi functions.
-		rntm_t* restrict rntm_p = rntm;
-
-		cntl_t*    cntl_use;
-		thrinfo_t* thread;
-
-		const dim_t tid = 0;
-
-		// Use the thread id to access the appropriate pool_t* within the
-		// array_t, and use it to set the sba_pool field within the rntm_t.
-		// If the pool_t* element within the array_t is NULL, it will first
-		// be allocated/initialized.
-		// NOTE: This is commented out because, in the single-threaded case,
-		// this is redundant since it's already been done above.
-		//bli_sba_rntm_set_pool( tid, array, rntm_p );
-
-		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
-		// need to alias objects for A, B, and C since they were already aliased
-		// in bli_*_front(). However, we may add aliasing here in the future so
-		// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
-		// consistently providing local aliases, we can then eliminate aliasing
-		// elsewhere.
-
-		// Create a default control tree for the operation, if needed.
-		bli_l3_cntl_create_if( family, schema_a, schema_b,
-		                       a, b, c, rntm_p, cntl, &cntl_use );
-
-		// Create the root node of the thread's thrinfo_t structure.
-		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
-
-		func
-		(
-		  alpha,
-		  a,
-		  b,
-		  beta,
-		  c,
-		  cntx,
-		  rntm_p,
-		  cntl_use,
-		  thread
-		);
-
-		// Free the thread's local control tree.
-		bli_l3_cntl_free( rntm_p, cntl_use, thread );
-
-		// Free the current thread's thrinfo_t structure.
-		bli_l3_thrinfo_free( rntm_p, thread );
-	}
-
-	// We shouldn't free the global communicator since it was already freed
-	// by the global communicator's chief thread in bli_l3_thrinfo_free()
-	// (called above).
-
-	// Check the array_t back into the small block allocator. Similar to the
-	// check-out, this is done using a lock embedded within the sba to ensure
-	// mutual exclusion.
-	bli_sba_checkin_array( array );
-}
-
 #endif

--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -39,8 +39,12 @@ thrinfo_t BLIS_PACKM_SINGLE_THREADED = {};
 thrinfo_t BLIS_GEMM_SINGLE_THREADED  = {};
 thrcomm_t BLIS_SINGLE_COMM           = {};

-// The global rntm_t structure, which holds the global thread settings.
-static rntm_t global_rntm;
+// The global rntm_t structure. (The definition resides in bli_rntm.c.)
+extern rntm_t global_rntm;
+
+// A mutex to allow synchronized access to global_rntm. (The definition
+// resides in bli_rntm.c.)
+extern bli_pthread_mutex_t global_rntm_mutex;

 // -----------------------------------------------------------------------------

@@ -1198,63 +1202,6 @@ dim_t bli_ipow( dim_t base, dim_t power )

    return p;
 }
-// -----------------------------------------------------------------------------
-
-dim_t bli_thread_get_env( const char* env, dim_t fallback )
-{
-	dim_t r_val;
-	char* str;
-
-	// Query the environment variable and store the result in str.
-	str = getenv( env );
-
-	// Set the return value based on the string obtained from getenv().
-	if ( str != NULL )
-	{
-		// If there was no error, convert the string to an integer and
-		// prepare to return that integer.
-		r_val = strtol( str, NULL, 10 );
-	}
-	else
-	{
-		// If there was an error, use the "fallback" as the return value.
-		r_val = fallback;
-	}
-
-	return r_val;
-}
-
-#if 0
-void bli_thread_set_env( const char* env, dim_t value )
-{
-	dim_t       r_val;
-	char        value_str[32];
-	const char* fs_32 = "%u";
-	const char* fs_64 = "%lu";
-
-	// Convert the string to an integer, but vary the format specifier
-	// depending on the integer type size.
-	if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
-	else                                      sprintf( value_str, fs_64, value );
-
-	// Set the environment variable using the string we just wrote to via
-	// sprintf(). (The 'TRUE' argument means we want to overwrite the current
-	// value if the environment variable already exists.)
-	r_val = bli_setenv( env, value_str, TRUE );
-
-	// Check the return value in case something went horribly wrong.
-	if ( r_val == -1 )
-	{
-		char err_str[128];
-
-		// Query the human-readable error string corresponding to errno.
-		strerror_r( errno, err_str, 128 );
-
-		// Print the error message.
-		bli_print_msg( err_str, __FILE__, __LINE__ );
-	}
-}
-#endif

 // -----------------------------------------------------------------------------

@@ -1308,9 +1255,6 @@ dim_t bli_thread_get_num_threads( void )

 // ----------------------------------------------------------------------------

-// A mutex to allow synchronous access to global_rntm.
-static bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
-
 void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
 {
 	// We must ensure that global_rntm has been initialized.
@@ -1341,22 +1285,6 @@ void bli_thread_set_num_threads( dim_t n_threads )

 // ----------------------------------------------------------------------------

-void bli_thread_init_rntm( rntm_t* rntm )
-{
-	// We must ensure that global_rntm has been initialized.
-	bli_init_once();
-
-	// Acquire the mutex protecting global_rntm.
-	bli_pthread_mutex_lock( &global_rntm_mutex );
-
-	*rntm = global_rntm;
-
-	// Release the mutex protecting global_rntm.
-	bli_pthread_mutex_unlock( &global_rntm_mutex );
-}
-
-// ----------------------------------------------------------------------------
-
 void bli_thread_init_rntm_from_env
     (
       rntm_t* rntm
@@ -1373,19 +1301,19 @@ void bli_thread_init_rntm_from_env
 #ifdef BLIS_ENABLE_MULTITHREADING

 	// Try to read BLIS_NUM_THREADS first.
-	nt = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
+	nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );

 	// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
 	if ( nt == -1 )
-		nt = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
+		nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );

 	// Read the environment variables for the number of threads (ways
 	// of parallelism) for each individual loop.
-	jc = bli_thread_get_env( "BLIS_JC_NT", -1 );
-	pc = bli_thread_get_env( "BLIS_PC_NT", -1 );
-	ic = bli_thread_get_env( "BLIS_IC_NT", -1 );
-	jr = bli_thread_get_env( "BLIS_JR_NT", -1 );
-	ir = bli_thread_get_env( "BLIS_IR_NT", -1 );
+	jc = bli_env_get_var( "BLIS_JC_NT", -1 );
+	pc = bli_env_get_var( "BLIS_PC_NT", -1 );
+	ic = bli_env_get_var( "BLIS_IC_NT", -1 );
+	jr = bli_env_get_var( "BLIS_JR_NT", -1 );
+	ir = bli_env_get_var( "BLIS_IR_NT", -1 );

 	// If any BLIS_*_NT environment variable was set, then we ignore the
 	// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -49,6 +49,14 @@
 #include "bli_packm_thrinfo.h"
 #include "bli_l3_thrinfo.h"

+// Include the level-3 thread decorator and related definitions and prototypes
+// for the conventional code path.
+#include "bli_l3_decor.h"
+
+// Include the level-3 thread decorator and related definitions and prototypes
+// for the sup code path.
+#include "bli_l3_sup_decor.h"
+
 // Initialization-related prototypes.
 void bli_thread_init( void );
 void bli_thread_finalize( void );
@@ -143,37 +151,6 @@ siz_t bli_thread_range_weighted_sub
       dim_t*     restrict j_end_thr
     );

-
-
-// Level-3 internal function type
-typedef void (*l3int_t)
-     (
-       obj_t*     alpha,
-       obj_t*     a,
-       obj_t*     b,
-       obj_t*     beta,
-       obj_t*     c,
-       cntx_t*    cntx,
-       rntm_t*    rntm,
-       cntl_t*    cntl,
-       thrinfo_t* thread
-     );
-
-// Level-3 thread decorator prototype
-void bli_l3_thread_decorator
-     (
-       l3int_t func,
-       opid_t  family,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
-
 // -----------------------------------------------------------------------------

 // Factorization and partitioning prototypes
@@ -205,9 +182,6 @@ dim_t bli_ipow( dim_t base, dim_t power );

 // -----------------------------------------------------------------------------

-BLIS_EXPORT_BLIS dim_t bli_thread_get_env( const char* env, dim_t fallback );
-//void  bli_thread_set_env( const char* env, dim_t value );
-
 BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
 BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
 BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -54,6 +54,12 @@
   Therefore, this (r)ow-preferential microkernel is well-suited for
   a dot-product-based accumulation that performs vector loads from
   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
 */

 // Prototype reference microkernels.
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -54,6 +54,12 @@
   Therefore, this (r)ow-preferential microkernel is well-suited for
   a dot-product-based accumulation that performs vector loads from
   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
 */

 // Prototype reference microkernels.
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -156,12 +156,44 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 			  beta, cij, rs_c0, cs_c0, data, cntx
 			);
 #else
-			bli_dgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
+			dim_t ps_a0 = bli_auxinfo_ps_a( data );
+
+			if ( ps_a0 == 6 * rs_a0 )
+			{
+				// Since A is not packed, we can use one gemv.
+				bli_dgemv_ex
+				(
+				  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+				  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+				  beta, cij, rs_c0, cntx, NULL
+				);
+			}
+			else
+			{
+				const dim_t mr = 6;
+
+				// Since A is packed into row panels, we must use a loop over
+				// gemv.
+				dim_t m_iter = ( m0 + mr - 1 ) / mr;
+				dim_t m_left =   m0            % mr;
+
+				double* restrict ai_ii  = ai;
+				double* restrict cij_ii = cij;
+
+				for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+				{
+					dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+					                 ? mr : m_left );
+
+					bli_dgemv_ex
+					(
+					  BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+					  alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+					  beta, cij_ii, rs_c0, cntx, NULL
+					);
+					cij_ii += mr*rs_c0; ai_ii += ps_a0;
+				}
+			}
 #endif
 		}
 		return;
@@ -185,6 +217,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of A and convert it to units of bytes.
+	uint64_t ps_a   = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8  = ps_a * sizeof( double );
+
 	if ( m_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -819,8 +855,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 	lea(mem(r12, rdi, 4), r12)         //
 	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

-	lea(mem(r14, r8,  4), r14)         //
-	lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
+	//lea(mem(r14, r8,  4), r14)         //
+	//lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
+	mov(var(ps_a8), rax)               // load ps_a8
+	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

 	dec(r11)                           // ii -= 1;
 	jne(.DLOOP6X8I)                    // iterate again if ii != 0.
@@ -841,6 +879,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
+      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
@@ -870,7 +909,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
 		const dim_t      i_edge = m0 - ( dim_t )m_left;

 		double* restrict cij = c + i_edge*rs_c;
-		double* restrict ai  = a + i_edge*rs_a;
+		// Step past the m_iter panels of A, which are ps_a elements apart
+		// (the asm loop above advances a_ii by ps_a8 bytes per iteration).
+		double* restrict ai  = a + m_iter * ps_a;
 		double* restrict bj  = b;

 #if 0
@@ -979,6 +1020,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of A and convert it to units of bytes.
+	uint64_t ps_a   = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8  = ps_a * sizeof( double );
+
 	if ( m_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -1591,8 +1636,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 	lea(mem(r12, rdi, 4), r12)         //
 	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

-	lea(mem(r14, r8,  4), r14)         //
-	lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
+	// Advance a_ii by the panel stride of A, ps_a8 (in bytes), rather than by
+	// 6*rs_a as was done previously.
+	mov(var(ps_a8), rax)               // load ps_a8
+	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

 	dec(r11)                           // ii -= 1;
 	jne(.DLOOP6X8I)                    // iterate again if ii != 0.
@@ -1613,6 +1660,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
+      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
@@ -1642,7 +1690,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
 		const dim_t      i_edge = m0 - ( dim_t )m_left;

 		double* restrict cij = c + i_edge*rs_c;
-		double* restrict ai  = a + i_edge*rs_a;
+		// Step past the m_iter panels of A, which are ps_a elements apart
+		// (the asm loop above advances a_ii by ps_a8 bytes per iteration).
+		double* restrict ai  = a + m_iter * ps_a;
 		double* restrict bj  = b;

 #if 0
@@ -1751,6 +1801,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of A and convert it to units of bytes.
+	uint64_t ps_a   = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8  = ps_a * sizeof( double );
+
 	if ( m_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -2241,8 +2295,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 	lea(mem(r12, rdi, 4), r12)         //
 	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

-	lea(mem(r14, r8,  4), r14)         //
-	lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
+	// Advance a_ii by the panel stride of A, ps_a8 (in bytes), rather than by
+	// 6*rs_a as was done previously.
+	mov(var(ps_a8), rax)               // load ps_a8
+	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

 	dec(r11)                           // ii -= 1;
 	jne(.DLOOP6X4I)                    // iterate again if ii != 0.
@@ -2263,6 +2319,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
+      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
@@ -2292,7 +2349,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
 		const dim_t      i_edge = m0 - ( dim_t )m_left;

 		double* restrict cij = c + i_edge*rs_c;
-		double* restrict ai  = a + i_edge*rs_a;
+		// Step past the m_iter panels of A, which are ps_a elements apart
+		// (the asm loop above advances a_ii by ps_a8 bytes per iteration).
+		double* restrict ai  = a + m_iter * ps_a;
 		double* restrict bj  = b;

 #if 0
@@ -2401,6 +2460,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of A and convert it to units of bytes.
+	uint64_t ps_a   = bli_auxinfo_ps_a( data );
+	uint64_t ps_a8  = ps_a * sizeof( double );
+
 	if ( m_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -2867,8 +2930,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 	lea(mem(r12, rdi, 4), r12)         //
 	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

-	lea(mem(r14, r8,  4), r14)         //
-	lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
+	// Advance a_ii by the panel stride of A, ps_a8 (in bytes), rather than by
+	// 6*rs_a as was done previously.
+	mov(var(ps_a8), rax)               // load ps_a8
+	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

 	dec(r11)                           // ii -= 1;
 	jne(.DLOOP6X2I)                    // iterate again if ii != 0.
@@ -2889,6 +2954,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
+      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
@@ -2918,7 +2984,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
 		const dim_t      i_edge = m0 - ( dim_t )m_left;

 		double* restrict cij = c + i_edge*rs_c;
-		double* restrict ai  = a + i_edge*rs_a;
+		// Step past the m_iter panels of A, which are ps_a elements apart
+		// (the asm loop above advances a_ii by ps_a8 bytes per iteration).
+		double* restrict ai  = a + m_iter * ps_a;
 		double* restrict bj  = b;

 #if 0
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -195,6 +195,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -853,6 +857,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -880,7 +885,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
@@ -977,6 +984,10 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -1596,6 +1607,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -1623,7 +1635,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
@@ -1720,6 +1734,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -2248,6 +2266,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -2275,7 +2294,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
@@ -2363,6 +2384,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -2921,6 +2946,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -2948,7 +2974,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
@@ -3036,6 +3064,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -3475,6 +3507,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -3502,7 +3535,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
@@ -3590,6 +3625,10 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;

+	// Query the panel stride of B and convert it to units of bytes.
+	uint64_t ps_b   = bli_auxinfo_ps_b( data );
+	uint64_t ps_b8  = ps_b * sizeof( double );
+
 	if ( n_iter == 0 ) goto consider_edge_cases;

 	// -------------------------------------------------------------------------
@@ -3993,6 +4032,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
+      [ps_b8]  "m" (ps_b8),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
@@ -4020,7 +4060,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n

 		double* restrict cij = c + j_edge*cs_c;
 		double* restrict ai  = a;
-		double* restrict bj  = b + j_edge*cs_b;
+		// Step past the n_iter panels of B, which are ps_b elements apart.
+		// NOTE(review): confirm the asm loop above also advances B by ps_b8.
+		double* restrict bj  = b + n_iter * ps_b;

 		if ( 6 <= n_left )
 		{
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -829,12 +829,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
 	dim_t  m = 1000, n = 1000, k = 1000;

-	bli_thread_init_rntm( &gemm   );
-	bli_thread_init_rntm( &herk   );
-	bli_thread_init_rntm( &trmm_l );
-	bli_thread_init_rntm( &trmm_r );
-	bli_thread_init_rntm( &trsm_l );
-	bli_thread_init_rntm( &trsm_r );
+	bli_rntm_init_from_global( &gemm   );
+	bli_rntm_init_from_global( &herk   );
+	bli_rntm_init_from_global( &trmm_l );
+	bli_rntm_init_from_global( &trmm_r );
+	bli_rntm_init_from_global( &trsm_l );
+	bli_rntm_init_from_global( &trsm_r );

 	bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT,  m, n, k, &gemm );
 	bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT,  m, n, k, &herk );