Replaced the broken ref99 sandbox with a simpler version.

Details:
- The 'ref99' sandbox was broken by multiple refactorings and internal
  API changes over the last two years. Rather than try to fix it, I've
  replaced it with a much simpler version based on var2 of gemmsup.
  Why not fix the previous implementation? It occurred to me that the
  old implementation was trying to be a lightly simplified duplication
  of what exists in the framework. Duplication aside, this sandbox
  would have worked fine if it had been completely independent of the
  framework code. The problem was that it was only partially
  independent, with many function calls calling a function in BLIS
  rather than a duplicated/simplified version within the sandbox. (And
  the reason I didn't make it fully independent to begin with was that
  it seemed unnecessarily duplicative at the time.) Maintaining two
  versions of the same implementation is problematic for obvious
  reasons, especially when it wasn't even done properly to begin with.
  This explains the reimplementation in this commit. The only catch is
  that the newer implementation is single-threaded only and does not
  perform any packing on either input matrix (A or B). Basically, it's
  only meant to be a simple placeholder that shows how you could plug
  in your own implementation. Thanks to Francisco Igual for reporting
  this brokenness.
- Updated the three reference gemmsup kernels (defined in
  ref_kernels/3/bli_gemmsup_ref.c) so that they properly handle
  conjugation of conja and/or conjb. The general storage kernel, which
  is currently identical to the column-storage kernel, is used in the
  new ref99 sandbox to provide basic support for all datatypes
  (including scomplex and dcomplex).
- Minor updates to docs/Sandboxes.md, including adding the threading
  and packing limitations to the Caveats section.
- Fixed a comment typo in bli_l3_sup_var1n2m.c (upon which the new
  sandbox implementation is based).
This commit is contained in:
Field G. Van Zee
2020-07-20 19:21:07 -05:00
committed by Dipal M Zambare
parent 004946ed06
commit 1d8d5cd9cf
33 changed files with 948 additions and 110 deletions

View File

@@ -52,10 +52,8 @@ configure: sandbox/ref99
And when you build BLIS, the last files to be compiled will be the source
code in the specified sandbox:
```
Compiling obj/haswell/sandbox/ref99/blx_gemm_front.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/blx_gemm_int.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/base/blx_blksz.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/cntl/blx_gemm_cntl.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/blx_gemm_ref_var2.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/oapi/bli_gemmnat.o ('haswell' CFLAGS for sandboxes)
...
```
That's it! After the BLIS library is built, it will contain your chosen
@@ -197,6 +195,12 @@ there's no way for it to confirm at runtime that an implementation was written
to support mixing datatypes. Note that even the `ref99` sandbox included with
BLIS does not support mixed-datatype computation.
* **Multithreading in ref99.** The current reference sandbox, `ref99`, does not
currently implement multithreading.
* **Packing matrices in ref99.** The current reference sandbox, `ref99`, does not
currently implement packing of matrices A or B.
## Conclusion
If you encounter any problems, or are really bummed-out that `gemm` is the

View File

@@ -1144,7 +1144,7 @@ void PASTEMAC(ch,varname) \
thread_pb \
); \
\
/* Alias a_use so that it's clear this is our current block of
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\

View File

@@ -60,43 +60,178 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
@@ -130,43 +265,178 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \

View File

@@ -56,14 +56,19 @@ void bli_gemmnat
{
bli_init_once();
// Obtain a valid native context from the gks if necessary.
// Obtain a valid (native) context from the gks if necessary.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
// Initialize a local runtime object if necessary.
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_l;
if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); }
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
// Invoke the operation's front end.
blx_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
//blx_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
blx_gemm_ref_var2( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
BLIS_XXX, cntx, rntm, NULL );
}

View File

@@ -39,7 +39,7 @@
// we #include any headers that would define prototypes or types that are
// needed by the ref99 sandbox source code.
#include "blx_gemm.h"
#include "blx_gemm_ref_var2.h"
#endif

View File

@@ -0,0 +1,361 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "blix.h"
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
bool_t packa,
bool_t packb,
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
stor3_t eff_id,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
thrinfo_t* restrict thread
);
//
// -- var2 ---------------------------------------------------------------------
//
static FUNCPTR_T GENARRAY(ftypes_var2,gemm_ref_var2);
void blx_gemm_ref_var2
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
)
{
const num_t dt = bli_obj_dt( c );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
dim_t k;
void* restrict buf_a = bli_obj_buffer_at_off( a );
inc_t rs_a;
inc_t cs_a;
void* restrict buf_b = bli_obj_buffer_at_off( b );
inc_t rs_b;
inc_t cs_b;
if ( bli_obj_has_notrans( a ) )
{
k = bli_obj_width( a );
rs_a = bli_obj_row_stride( a );
cs_a = bli_obj_col_stride( a );
}
else // if ( bli_obj_has_trans( a ) )
{
// Assign the variables with an implicit transposition.
k = bli_obj_length( a );
rs_a = bli_obj_col_stride( a );
cs_a = bli_obj_row_stride( a );
}
if ( bli_obj_has_notrans( b ) )
{
rs_b = bli_obj_row_stride( b );
cs_b = bli_obj_col_stride( b );
}
else // if ( bli_obj_has_trans( b ) )
{
// Assign the variables with an implicit transposition.
rs_b = bli_obj_col_stride( b );
cs_b = bli_obj_row_stride( b );
}
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
// Index into the type combination array to extract the correct
// function pointer.
FUNCPTR_T f = ftypes_var2[dt];
if ( bli_is_notrans( trans ) )
{
// Invoke the function.
f
(
packa,
packb,
conja,
conjb,
m,
n,
k,
buf_alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
buf_beta,
buf_c, rs_c, cs_c,
eff_id,
cntx,
rntm,
thread
);
}
else
{
bli_abort();
}
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool_t packa, \
bool_t packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
/* Query the context for various blocksizes. NOTE: We query the
regular blocksizes since the sup blocksizes are not guaranteed
to have default values. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
const inc_t jrstep_b = cs_b * NR; \
\
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of beta and one scalars to prevent any unnecessary
sharing of cache lines between the cores' caches. */ \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
\
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n + NC - 1 ) / NC;*/ \
const dim_t jc_left = n % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = 0; jj < n; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= n - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k + KC - 1 ) / KC;*/ \
const dim_t pc_left = k % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < k; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= k - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m + MC - 1 ) / MC;*/ \
const dim_t ic_left = m % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < m; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= m - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += 1 ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc + j * jrstep_b; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Compute number of primary and leftover components of the IR loop. */ \
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
\
/* Loop over the m dimension (MR columns at a time). */ \
for ( dim_t i = 0; i < ir_iter; i += 1 ) \
{ \
const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic + i * irstep_a; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
/*
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
*/ \
\
/* Invoke the kernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mr_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ir, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ref_var2 )

View File

@@ -0,0 +1,73 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void blx_gemm_ref_var2
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool_t packa, \
bool_t packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemm_ref_var2 )

View File

@@ -38,21 +38,23 @@
cntl_t* blx_gemm_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
)
{
return blx_gemmbp_cntl_create( family, schema_a, schema_b );
return blx_gemmbp_cntl_create( rntm, family, schema_a, schema_b );
}
// -----------------------------------------------------------------------------
cntl_t* blx_gemmbp_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
)
{
void_fp macro_kernel_fp;
@@ -67,6 +69,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
(
rntm, // the thread's runtime structure
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
@@ -75,6 +78,7 @@ cntl_t* blx_gemmbp_cntl_create
cntl_t* gemm_cntl_bp_bu = blx_gemm_cntl_create_node
(
rntm, // the thread's runtime structure
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_fp,
@@ -84,6 +88,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
(
rntm,
blx_gemm_packa, // pack the left-hand operand
packa_fp,
BLIS_MR,
@@ -99,6 +104,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the m dimension by MC.
cntl_t* gemm_cntl_op_bp = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_MC,
blx_gemm_blk_var1,
@@ -108,6 +114,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
(
rntm,
blx_gemm_packb, // pack the right-hand operand
packb_fp,
BLIS_KR,
@@ -123,6 +130,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_KC,
blx_gemm_blk_var3,
@@ -132,6 +140,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the n dimension by NC.
cntl_t* gemm_cntl_vl_mm = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_NC,
blx_gemm_blk_var2,
@@ -145,23 +154,25 @@ cntl_t* blx_gemmbp_cntl_create
void blx_gemm_cntl_free
(
cntl_t* cntl,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bli_cntl_free( cntl, thread );
bli_cntl_free( rntm, cntl, thread );
}
// -----------------------------------------------------------------------------
cntl_t* blx_gemm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,
cntl_t* sub_node
)
{
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node );
}

View File

@@ -34,25 +34,28 @@
cntl_t* blx_gemm_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
);
// -----------------------------------------------------------------------------
cntl_t* blx_gemmbp_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
);
// -----------------------------------------------------------------------------
void blx_gemm_cntl_free
(
cntl_t* cntl,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
@@ -60,6 +63,7 @@ void blx_gemm_cntl_free
cntl_t* blx_gemm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,

View File

@@ -39,30 +39,16 @@
void blx_l3_cntl_create_if
(
opid_t family,
pack_t schema_a,
pack_t schema_b,
obj_t* a,
obj_t* b,
obj_t* c,
rntm_t* rntm,
cntl_t* cntl_orig,
cntl_t** cntl_use
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects. Notice that we do this even if the
// caller passed in a custom control tree; that's because we still need
// to reset the pack schema of a and b, which were modified by the
// operation's _front() function. However, in order for this to work,
// the level-3 thread entry function (or omp parallel region) must
// alias thread-local copies of objects a and b.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// If the control tree pointer is NULL, we construct a default
// tree as a function of the operation family.
if ( cntl_orig == NULL )
@@ -74,7 +60,7 @@ void blx_l3_cntl_create_if
// If the user provided a control tree, create a copy and use it
// instead (so that threads can use its local tree as a place to
// cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
*cntl_use = bli_cntl_copy( rntm, cntl_orig );
// Recursively set the family fields of the newly copied control tree
// nodes.
@@ -82,13 +68,10 @@ void blx_l3_cntl_create_if
}
}
void blx_l3_cntl_free_if
void blx_l3_cntl_free
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl_orig,
cntl_t* cntl_use,
rntm_t rntm,
cntl_t* cntl_use,
thrinfo_t* thread
)
{
@@ -96,13 +79,13 @@ void blx_l3_cntl_free_if
// been created, so we now must free it.
if ( cntl_orig == NULL )
{
blx_gemm_cntl_free( cntl_use, thread );
blx_gemm_cntl_free( rntm, cntl_use, thread );
}
else
{
// If the user provided a control tree, free the copy of it that
// was created.
bli_cntl_free( cntl_use, thread );
bli_cntl_free( rntm, cntl_use );
}
}

View File

@@ -35,20 +35,19 @@
void blx_l3_cntl_create_if
(
opid_t family,
pack_t schema_a,
pack_t schema_b,
obj_t* a,
obj_t* b,
obj_t* c,
rntm_t* rntm,
cntl_t* cntl_orig,
cntl_t** cntl_use
);
void blx_l3_cntl_free_if
void blx_l3_cntl_free
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl_orig,
cntl_t* cntl_use,
rntm_t rntm,
cntl_t* cntl_use,
thrinfo_t* thread
);

View File

@@ -36,6 +36,7 @@
cntl_t* blx_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,

View File

@@ -34,6 +34,7 @@
cntl_t* blx_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,

View File

@@ -38,6 +38,7 @@
// This code is enabled only when multithreading is enabled via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
#if 0
void blx_gemm_thread
(
gemmint_t func,
@@ -101,6 +102,129 @@ void blx_gemm_thread
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
}
#endif
// Sandbox-local OpenMP thread decorator for gemm. Spawns a parallel region
// with the number of threads requested via the rntm_t and invokes func --
// the internal gemm variant -- once from every thread. Each thread gets
// thread-local aliases of A, B, and C, a private copy of the rntm_t, a
// (possibly default) control tree, and a root thrinfo_t node.
//
// Parameters:
//   func   - the internal gemm variant to execute (gemmint_t).
//   family - the level-3 operation family id.
//   alpha, a, b, beta, c - operands of the gemm operation.
//   cntx   - the context to pass through to func.
//   rntm   - runtime object; queried for the thread count and updated with
//            the sba pool and packing block allocator before being copied
//            per-thread.
//   cntl   - optional caller-supplied control tree; if NULL, a default tree
//            is created per-thread (and freed afterwards).
void blx_gemm_thread
(
gemmint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
// This must happen before the parallel region so all threads share it.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// (Currently disabled in this sandbox.)
//bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
obj_t a_t, b_t, c_t;
cntl_t* cntl_use;
thrinfo_t* thread;
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );
bli_obj_alias_to( b, &b_t );
bli_obj_alias_to( c, &c_t );
// Create a default control tree for the operation, if needed (i.e. if
// the caller passed cntl == NULL); otherwise a copy of cntl is used.
blx_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
blx_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
// Execute the internal gemm variant from this thread.
func
(
alpha,
&a_t,
&b_t,
beta,
&c_t,
cntx,
rntm_p,
cntl_use,
thread
);
// Free the thread's local control tree.
blx_l3_cntl_free( rntm_p, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
#endif

View File

@@ -35,8 +35,10 @@
// gemm internal function type
typedef void (*gemmint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,