diff --git a/README.md b/README.md
index 565efb09e..a6b69a5c3 100644
--- a/README.md
+++ b/README.md
@@ -169,10 +169,14 @@ performance remain attainable.
  * **A foundation for mixed domain and/or mixed precision operations.** BLIS
 was designed with the hope of one day allowing computation on real and complex
 operands within the same operation. Similarly, we wanted to allow mixing
-operands' floating-point precisions, or both domain and precision.
-While this feature is not yet implemented, we plan to prototype and explore
-the potential for adding mixed domain, mixed precision support to operations
-such as `gemm`. 
+operands' numerical domains, floating-point precisions, or both domain and
+precision, and to optionally compute in a precision different than one or both
+operands' storage precisions. This feature has been implemented for the general
+matrix multiplication (`gemm`) operation, providing 128 different possible type
+combinations, which, when combined with existing transposition, conjugation,
+and storage parameters, enables 55,296 different `gemm` use cases. For more
+details, please see the documentation on [mixed datatype](docs/MixedDatatypes.md)
+support.
 
 Getting Started
 ---------------
@@ -230,6 +234,9 @@ included in the BLIS source distribution.
 table of supported microarchitectures.
  * **[Multithreading](docs/Multithreading.md).** This document describes how to
 use the multithreading features of BLIS.
+ * **[Mixed-Datatype](docs/MixedDatatypes.md).** This document provides an
+overview of BLIS's mixed-datatype functionality and provides a brief example
+of how to take advantage of this new code.
  * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
 changes included with each new version of BLIS, along with contributor credits
 for key features.
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 97b2fcca0..b7e5adf85 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -92,6 +92,26 @@
 #endif
 #endif
 
+#ifndef BLIS_ENABLE_MIXED_DT
+#ifndef BLIS_DISABLE_MIXED_DT
+#if @enable_mixed_dt@
+#define BLIS_ENABLE_MIXED_DT
+#else
+#define BLIS_DISABLE_MIXED_DT
+#endif
+#endif
+#endif
+
+#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
+#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
+#if @enable_mixed_dt_extra_mem@
+#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
+#else
+#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
+#endif
+#endif
+#endif
+
 #if @enable_memkind@
 #define BLIS_ENABLE_MEMKIND
 #else
diff --git a/configure b/configure
index 0a8f58c0a..e5c17fd5f 100755
--- a/configure
+++ b/configure
@@ -191,6 +191,24 @@ print_usage()
 	echo "                 compatibility layer. This automatically enables the"
 	echo "                 BLAS compatibility layer as well."
 	echo " "
+	echo "   --disable-mixed-dt, --enable-mixed-dt"
+	echo " "
+	echo "                 Disable (enabled by default) support for mixing the"
+	echo "                 storage domain and/or storage precision of matrix"
+	echo "                 operands for the gemm operation, as well as support"
+	echo "                 for computing in a precision different from one or"
+	echo "                 both of matrices A and B."
+	echo " "
+	echo "   --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem"
+	echo " "
+	echo "                 Disable (enabled by default) support for additional"
+	echo "                 mixed datatype optimizations that require temporarily"
+	echo "                 allocating extra memory--specifically, a single m x n"
+	echo "                 matrix (per application thread) whose storage datatype"
+	echo "                 is equal to the computation datatype. This option may"
+	echo "                 only be enabled when mixed domain/precision support is"
+	echo "                 enabled."
+	echo " "
 	echo "   -s NAME --enable-sandbox=NAME"
 	echo " "
 	echo "                 Enable a separate sandbox implementation of gemm. This"
@@ -1605,6 +1623,8 @@ main()
 	blas_int_type_size=32
 	enable_blas='yes'
 	enable_cblas='no'
+	enable_mixed_dt='yes'
+	enable_mixed_dt_extra_mem='yes'
 	enable_memkind='' # The default memkind value is determined later on.
 	force_version='no'
 
@@ -1739,6 +1759,18 @@ main()
 					disable-cblas)
 						enable_cblas='no'
 						;;
+					enable-mixed-dt)
+						enable_mixed_dt='yes'
+						;;
+					disable-mixed-dt)
+						enable_mixed_dt='no'
+						;;
+					enable-mixed-dt-extra-mem)
+						enable_mixed_dt_extra_mem='yes'
+						;;
+					disable-mixed-dt-extra-mem)
+						enable_mixed_dt_extra_mem='no'
+						;;
 					with-memkind)
 						enable_memkind='yes'
 						;;
@@ -2414,8 +2446,35 @@ main()
 		echo "${script_name}: the CBLAS compatibility layer is disabled."
 		enable_cblas_01=0
 	fi
-	
-	# Report integer sizes
+	if [ "x${enable_mixed_dt}" = "xyes" ]; then
+		echo "${script_name}: mixed datatype support is enabled."
+
+		if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then
+			echo "${script_name}: mixed datatype optimizations requiring extra memory are enabled."
+			enable_mixed_dt_extra_mem_01=1
+		else
+			echo "${script_name}: mixed datatype optimizations requiring extra memory are disabled."
+			enable_mixed_dt_extra_mem_01=0
+		fi
+
+		enable_mixed_dt_01=1
+	else
+		echo "${script_name}: mixed datatype support is disabled."
+
+		# The extra-memory optimizations are a sub-feature of mixed datatype
+		# support and default to enabled. Aborting here would make a lone
+		# --disable-mixed-dt fail, so implicitly disable the sub-feature
+		# instead and tell the user that we did so.
+		if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then
+			echo "${script_name}: mixed datatype optimizations requiring extra memory are implicitly disabled."
+			enable_mixed_dt_extra_mem='no'
+		fi
+		enable_mixed_dt_extra_mem_01=0
+
+		enable_mixed_dt_01=0
+	fi
+
+	# Report integer sizes.
 	if [ "x${int_type_size}" = "x32" ]; then
 		echo "${script_name}: the internal integer size is 32-bit."
 	elif [ "x${int_type_size}" = "x64" ]; then
@@ -2595,6 +2654,8 @@ main()
 		| sed   -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
 		| sed   -e "s/@enable_blas@/${enable_blas_01}/g" \
 		| sed   -e "s/@enable_cblas@/${enable_cblas_01}/g" \
+		| sed   -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
+		| sed   -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
 		| sed   -e "s/@enable_memkind@/${enable_memkind_01}/g" \
 		| sed   -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \
 		| sed   -e "s/@enable_shared@/${enable_shared_01}/g" \
diff --git a/docs/MixedDatatypes.md b/docs/MixedDatatypes.md
new file mode 100644
index 000000000..90c2a8703
--- /dev/null
+++ b/docs/MixedDatatypes.md
@@ -0,0 +1,217 @@
+## Contents
+
+* **[Contents](MixedDatatypes.md#contents)**
+* **[Introduction](MixedDatatypes.md#introduction)**
+* **[Categories of mixed datatypes](MixedDatatypes.md#categories-of-mixed-datatypes)**
+  * **[Computation precision](MixedDatatypes.md#computation-precision)**
+  * **[Computation domain](MixedDatatypes.md#computation-domain)**
+* **[Performing gemm with mixed datatypes](MixedDatatypes.md#performing-gemm-with-mixed-datatypes)**
+* **[Known Issues](MixedDatatypes.md#known-issues)**
+* **[Conclusion](MixedDatatypes.md#conclusion)**
+
+## Introduction
+
+This document serves as a guide to users interested in taking advantage of
+BLIS's support for performing the `gemm` operation on operands of differing
+types.
+
+## Categories of mixed datatypes
+
+Before going any further, we find it useful to categorize mixed datatype
+support into four categories:
+
+1. **Fully identical datatypes.** This is what people generally think of when
+they think about the `gemm` operation: all operands are stored in the same
+datatype (precision and domain), and the matrix product computation is
+performed in the arithmetic represented by that datatype. (This category
+doesn't actually involve mixing datatypes, but it's still worthwhile to
+define.)
+Example: matrix C updated by the product of matrix A and matrix B
+(all matrices double-precision real).
+
+2. **Mixed domain with identical precisions.** This category includes all
+combinations of datatypes where the domain (real or complex) of each
+operand may vary while the precisions (single or double precision) are
+held constant across all operands.
+Example: complex matrix C updated by the product of real matrix A and
+complex matrix B (all matrices single-precision).
+
+3. **Mixed precision within a single domain.** Here, all operands are stored
+in the same domain (real or complex); however, the precision of each operand
+may vary.
+Example: double-precision real matrix C updated by the product of
+single-precision real matrix A and single-precision real matrix B.
+
+4. **Mixed precision and mixed domain.** This category allows both the domain
+and the precision of each matrix operand to vary.
+Example: double-precision complex matrix C updated by the product of
+single-precision complex matrix A and single-precision real matrix B.
+
+BLIS's implementation of mixed-datatype `gemm` supports all combinations
+within all four categories.
+
+### Computation precision
+
+Because categories 3 and 4 involve mixing precisions, they come with an added
+parameter: the *computation precision*. This parameter specifies the precision
+in which the matrix multiplication (product) takes place. This precision
+can be different than the storage precision of matrices A or B, and/or the
+storage precision of matrix C.
+
+When the computation precision differs from the storage precision of matrix A,
+it implies that a typecast must occur when BLIS packs matrix A to contiguous
+storage. Similarly, B may also need to be typecast during packing.
+
+When the computation precision differs from the storage precision of C, it
+means the result of the matrix product A*B must be typecast just before it
+is accumulated back into matrix C.
+
+### Computation domain
+
+In addition to the computation precision, we also track a computation domain.
+(Together, they form the computation datatype.) However, for now we do not
+allow the user to explicitly specify the computation domain. Instead, the
+computation domain is implied by the domains of A, B, and C. The following
+table enumerates the six cases where there is at least one operand of each
+domain, along with the corresponding same-domain cases from category 1 for
+reference. We also list the total number of floating-point operations
+performed in each case.
+In the table, an 'R' denotes a real domain matrix operand while a 'C' denotes
+a matrix in the complex domain. The R's and C's appear in the following
+format of C += A * B, where A, B, and C are the matrix operands of `gemm`.
+
+| Case # | Mixed domain case | Implied computation domain | flops performed |
+|--------|:-----------------:|:--------------------------:|:---------------:|
+|   1    | R += R * R        |          real              |     2mnk        |
+|   2    | R += R * C        |          real              |     2mnk        |
+|   3    | R += C * R        |          real              |     2mnk        |
+|   4    | R += C * C        |       complex              |     4mnk        |
+|   5    | C += R * R        |          real              |     2mnk        |
+|   6    | C += R * C        |       complex              |     4mnk        |
+|   7    | C += C * R        |       complex              |     4mnk        |
+|   8    | C += C * C        |       complex              |     8mnk        |
+
+The computation domain is implied in cases 1 and 8 in the same way that
+it would be if mixed datatype support were absent entirely. These
+cases execute 2mnk and 8mnk flops, respectively, as any traditional
+implementation would.
+
+In cases 2 and 3, we assume the computation domain is real because only
+B or A, respectively, is complex. Thus, in these cases, the imaginary
+components of the complex matrix are ignored, allowing us to perform
+only 2mnk flops.
+
+In case 5, we take the computation domain to be real because A and B are
+both real, and thus it makes no sense to compute in the complex domain.
+This means that we need only update the real components of C, leaving
+the imaginary components untouched. This also results in 2mnk flops
+being performed.
+
+In case 4, we have complex A and B, allowing us to compute a complex
+product. However, we can only save the real part of that complex product
+since the output matrix C is real. Since we cannot update the imaginary
+component of C (since it is not stored), we avoid computing that half of
+the update entirely, reducing the flops performed to 4mnk. (Alternatively,
+one may wish to request real domain computation, in which case the
+imaginary components of A and B would be ignored *prior* to computing the
+matrix product. This approach would result in only 2mnk flops being
+performed.)
+
+In case 6, we wish for both the real and imaginary parts of B to participate
+in the multiplication by A, with the result updating the corresponding real
+and imaginary parts of C. Granted, the imaginary part of A is zero, and this
+is taken advantage of in the computation to optimize performance, as indicated
+by the 4mnk flop count. But fundamentally this computation executes in the
+complex domain because both the real and imaginary parts of C are updated.
+A similar story can be told about case 7.
+
+## Performing gemm with mixed datatypes
+
+In BLIS, performing a mixed-datatype `gemm` operation is easy. However,
+it will require that the user call `gemm` through BLIS's object API.
+For a basic series of examples for using the object-based API, please
+see the example codes in the `examples/oapi` directory of the BLIS source
+distribution.
+
+The first step is to ensure that BLIS is configured with mixed datatype support.
+Please consult with your current distribution's `configure` script for the
+current semantics:
+```
+$ ./configure --help
+```
+As of this writing, mixed datatype support is enabled by default, and thus
+no additional options are needed.
+
+With mixed datatype support enabled in BLIS, using the functionality is
+simply a matter of creating and initializing matrices of different precisions
+and/or domains.
+```c
+dim_t  m = 5, n = 4, k = 2;
+obj_t  a, b, c;
+obj_t* alpha;
+obj_t* beta;
+
+bli_obj_create( BLIS_DOUBLE,   m, k, 0, 0, &a );
+bli_obj_create( BLIS_FLOAT,    k, n, 0, 0, &b );
+bli_obj_create( BLIS_SCOMPLEX, m, n, 0, 0, &c );
+
+alpha = &BLIS_ONE;
+beta  = &BLIS_ONE;
+
+bli_randm( &a );
+bli_randm( &b );
+bli_randm( &c );
+```
+Then, you specify the computation precision by setting the computation
+precision property of matrix C.
+```c
+bli_obj_set_comp_prec( BLIS_DOUBLE_PREC, &c );
+```
+If you do not explicitly specify the computation precision, it will default
+to the *storage* precision of C.
+
+With the objects created and the computation precision specified, call
+`bli_gemm()` just as you would if the datatypes were identical:
+```c
+bli_gemm( alpha, &a, &b, beta, &c );
+```
+For more examples of using BLIS's object-based API, including methods
+of initializing a matrix object with arbitrary values, please review the
+example code found in the `examples/oapi` directory of the BLIS source
+distribution.
+
+## Known Issues
+
+While BLIS implements 128 mixed-datatype combinations of `gemm`, there may be
+odd behavior in the current implementation that does not conform to the reader's
+expectations. Below is a list of issues that BLIS developers are aware of in
+the context of mixed-datatype `gemm`. If any of these issues poses a problem for
+your application, please contact us by
+[opening an issue](https://github.com/flame/blis/issues).
+
+* **alpha with non-zero imaginary components.** Currently, there are many cases
+of mixed-datatype `gemm` that do not yet support computing with `alpha` scalars
+that have non-zero imaginary components--in other words, values of `alpha` that
+are not in the real domain. (By contrast, non-real values for `beta` are fully
+supported.) In order to support these use cases, additional code complexity and
+logic would be required. Thus, we have chosen, for now, to not implement them.
+If mixed-datatype `gemm` is invoked with a non-real valued `alpha` scalar, a
+runtime error message will be printed and the linked program will abort.
+
+* **Manually specifying the computation domain.** As mentioned in the section
+discussing the [computation domain](MixedDatatypes.md#computation-domain),
+the computation domain of any case of mixed domain `gemm` is implied by the
+operands and thus fixed; the user may not specify a different computation
+domain, even if the mixed-domain case would reasonably allow for computing
+in either domain.
+
+## Conclusion
+
+For more information and documentation on BLIS, please visit the [BLIS github page](https://github.com/flame/blis/).
+
+If you found a bug or wish to request a feature, please [open an issue](https://github.com/flame/blis/issues).
+
+For general discussion or questions, please join and post a message to the [blis-devel mailing list](http://groups.google.com/group/blis-devel).
+
+Thanks for your interest in BLIS!
+
diff --git a/examples/oapi/11gemm_md.c b/examples/oapi/11gemm_md.c
new file mode 100644
index 000000000..8ae40c1f4
--- /dev/null
+++ b/examples/oapi/11gemm_md.c
@@ -0,0 +1,269 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdio.h>
+#include "blis.h"
+
+int main( int argc, char** argv )
+{
+	num_t dt_r, dt_c;
+	num_t dt_s, dt_d;
+	num_t dt_a, dt_b;
+	dim_t m, n, k;
+	inc_t rs, cs;
+
+	obj_t a, b, c;
+	obj_t* alpha;
+	obj_t* beta;
+
+	//
+	// This file demonstrates mixing datatypes in gemm.
+	//
+	// NOTE: Please make sure that mixed datatype support is enabled in BLIS
+	// before proceeding to build and run the example binaries. If you're not
+	// sure whether mixed datatype support is enabled in BLIS, please refer
+	// to './configure --help' for the relevant options.
+	//
+
+	//
+	// Example 1: Perform a general matrix-matrix multiply (gemm) operation
+	//            with operands of different domains (but identical precisions).
+	//
+
+	printf( "\n#\n#  -- Example 1 --\n#\n\n" );
+
+	// Create some matrix operands to work with.
+	dt_r = BLIS_DOUBLE;
+	dt_c = BLIS_DCOMPLEX;
+	m = 4; n = 5; k = 1; rs = 0; cs = 0;
+	bli_obj_create( dt_c, m, n, rs, cs, &c );
+	bli_obj_create( dt_r, m, k, rs, cs, &a );
+	bli_obj_create( dt_c, k, n, rs, cs, &b );
+
+	// Set the scalars to use.
+	alpha = &BLIS_ONE;
+	beta  = &BLIS_ONE;
+
+	// Initialize the matrix operands.
+	bli_randm( &a );
+	bli_randm( &b );
+	bli_setm( &BLIS_ZERO, &c );
+
+	bli_printm( "a (double real):    randomized", &a, "%4.1f", "" );
+	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
+	bli_printm( "c (double complex): initial value", &c, "%4.1f", "" );
+
+	// c := beta * c + alpha * a * b, where 'a' is real, and 'b' and 'c' are
+	// complex.
+	bli_gemm( alpha, &a, &b, beta, &c );
+
+	bli_printm( "c (double complex): after gemm", &c, "%4.1f", "" );
+
+	// Free the objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+	bli_obj_free( &c );
+
+	//
+	// Example 2: Perform a general matrix-matrix multiply (gemm) operation
+	//            with operands of different precisions (but identical domains).
+	//
+
+	printf( "\n#\n#  -- Example 2 --\n#\n\n" );
+
+	// Create some matrix operands to work with.
+	dt_s = BLIS_FLOAT;
+	dt_d = BLIS_DOUBLE;
+	m = 4; n = 5; k = 1; rs = 0; cs = 0;
+	bli_obj_create( dt_d, m, n, rs, cs, &c );
+	bli_obj_create( dt_s, m, k, rs, cs, &a );
+	bli_obj_create( dt_s, k, n, rs, cs, &b );
+
+	// Notice that we've chosen C to be double-precision real and A and B to be
+	// single-precision real.
+
+	// Since we are mixing precisions, we will also need to specify the
+	// so-called "computation precision." That is, we need to signal to
+	// bli_gemm() whether we want the A*B product to be computed in single
+	// precision or double precision (prior to the result being accumulated
+	// back to C). To specify the computation precision, we need to set the
+	// corresponding bit in the C object. Here, we specify double-precision
+	// computation.
+	// NOTE: If you do not explicitly specify the computation precision, it
+	// will default to the storage precision of the C object.
+	bli_obj_set_comp_prec( BLIS_DOUBLE_PREC, &c );
+
+	// Initialize the matrix operands.
+	bli_randm( &a );
+	bli_randm( &b );
+	bli_setm( &BLIS_ZERO, &c );
+
+	bli_printm( "a (single real): randomized", &a, "%4.1f", "" );
+	bli_printm( "b (single real): randomized", &b, "%4.1f", "" );
+	bli_printm( "c (double real): initial value", &c, "%4.1f", "" );
+
+	// c := beta * c + alpha * a * b, where 'a' and 'b' are single-precision
+	// real, 'c' is double-precision real, and the matrix product is performed
+	// in double-precision arithmetic.
+	bli_gemm( alpha, &a, &b, beta, &c );
+
+	bli_printm( "c (double real): after gemm (exec prec = double precision)", &c, "%4.1f", "" );
+
+	// Free the objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+	bli_obj_free( &c );
+
+	//
+	// Example 3: Perform a general matrix-matrix multiply (gemm) operation
+	//            with operands of different domains AND precisions.
+	//
+
+	printf( "\n#\n#  -- Example 3 --\n#\n\n" );
+
+	// Create some matrix operands to work with.
+	dt_a = BLIS_FLOAT;
+	dt_b = BLIS_DCOMPLEX;
+	dt_c = BLIS_SCOMPLEX;
+	m = 4; n = 5; k = 1; rs = 0; cs = 0;
+	bli_obj_create( dt_c, m, n, rs, cs, &c );
+	bli_obj_create( dt_a, m, k, rs, cs, &a );
+	bli_obj_create( dt_b, k, n, rs, cs, &b );
+
+	// Notice that we've chosen C to be single-precision complex, and A to be
+	// single-precision real, and B to be double-precision complex.
+
+	// Set the computation precision to single precision this time.
+	bli_obj_set_comp_prec( BLIS_SINGLE_PREC, &c );
+
+	// Initialize the matrix operands.
+	bli_randm( &a );
+	bli_randm( &b );
+	bli_setm( &BLIS_ZERO, &c );
+
+	bli_printm( "a (single real): randomized", &a, "%4.1f", "" );
+	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
+	bli_printm( "c (single complex): initial value", &c, "%4.1f", "" );
+
+	// c := beta * c + alpha * a * b, where 'a' is single-precision real, 'b'
+	// is double-precision complex, 'c' is single-precision complex, and the
+	// matrix product is performed in single-precision arithmetic.
+	bli_gemm( alpha, &a, &b, beta, &c );
+
+	bli_printm( "c (single complex): after gemm (exec prec = single precision)", &c, "%4.1f", "" );
+
+	// Free the objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+	bli_obj_free( &c );
+
+	//
+	// Example 4: Project objects between the real and complex domains.
+	//
+
+	printf( "\n#\n#  -- Example 4 --\n#\n\n" );
+
+	// Create some matrix operands to work with.
+	dt_r = BLIS_DOUBLE;
+	dt_c = BLIS_DCOMPLEX;
+	m = 4; n = 5; rs = 0; cs = 0;
+	bli_obj_create( dt_r, m, n, rs, cs, &a );
+	bli_obj_create( dt_c, m, n, rs, cs, &b );
+
+	// Initialize a real matrix A.
+	bli_randm( &a );
+
+	bli_printm( "a (double real): randomized", &a, "%4.1f", "" );
+
+	// Project real matrix A to the complex domain (in B).
+	bli_projm( &a, &b );
+
+	bli_printm( "b (double complex): projected from 'a'", &b, "%4.1f", "" );
+
+	// Notice how the imaginary components in B are zero since any real
+	// matrix implicitly has imaginary values that are equal to zero.
+
+	// Now let's project in the other direction.
+
+	// Initialize the complex matrix B.
+	bli_randm( &b );
+
+	bli_printm( "b (double complex): randomized", &b, "%4.1f", "" );
+
+	// Project complex matrix B to the real domain (in A).
+	bli_projm( &b, &a );
+
+	bli_printm( "a (double real): projected from 'b'", &a, "%4.1f", "" );
+
+	// Notice how the imaginary components are lost in the projection from
+	// the complex domain to the real domain.
+
+	// Free the objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+
+	//
+	// Example 5: Typecast objects between the single and double precisions.
+	//
+
+	printf( "\n#\n#  -- Example 5 --\n#\n\n" );
+
+	// Create some matrix operands to work with.
+	dt_s = BLIS_FLOAT;
+	dt_d = BLIS_DOUBLE;
+	m = 4; n = 3; rs = 0; cs = 0;
+	bli_obj_create( dt_d, m, n, rs, cs, &a );
+	bli_obj_create( dt_s, m, n, rs, cs, &b );
+
+	// Initialize a double-precision real matrix A.
+	bli_randm( &a );
+
+	bli_printm( "a (double real): randomized", &a, "%23.16e", "" );
+
+	// Typecast A to single precision.
+	bli_castm( &a, &b );
+
+	bli_printm( "b (single real): typecast from 'a'", &b, "%23.16e", "" );
+
+	// Notice how the values in B are only accurate to the 6th or 7th decimal
+	// place relative to the true values in A.
+
+	// Free the objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+
+
+	return 0;
+}
+
diff --git a/examples/oapi/Makefile b/examples/oapi/Makefile
index a8373c448..7f622bf5e 100644
--- a/examples/oapi/Makefile
+++ b/examples/oapi/Makefile
@@ -127,7 +127,8 @@ TEST_BINS      := 00obj_basic.x \
                   07level1m_diag.x \
                   08level2.x \
                   09level3.x \
-                  10util.x
+                  10util.x \
+                  11gemm_md.x
 
 
 
diff --git a/frame/0/bli_l0_fpa.c b/frame/0/bli_l0_fpa.c
index 37c4a5dfb..75db984f9 100644
--- a/frame/0/bli_l0_fpa.c
+++ b/frame/0/bli_l0_fpa.c
@@ -41,7 +41,7 @@
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA( void*, opname ); \
+GENARRAY_FPA( PASTECH(opname,_vft), opname ); \
 \
 PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \
 { \
@@ -63,7 +63,7 @@ GENFRONT( zipsc )
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
-GENARRAY_FPA_I( void*, opname ); \
+GENARRAY_FPA_I( PASTECH(opname,_vft), opname ); \
 \
 PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \
 { \
diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c
index ca1e4657f..dc6c44aec 100644
--- a/frame/1d/bli_l1d_check.c
+++ b/frame/1d/bli_l1d_check.c
@@ -103,6 +103,22 @@ GENFRONT( setd )
 GENFRONT( setid )
 
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+\
+void PASTEMAC(opname,_check) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+     ) \
+{ \
+	bli_l1d_axy_check( beta, x, y ); \
+}
+
+GENFRONT( xpbyd )
+
+
 // -----------------------------------------------------------------------------
 
 void bli_l1d_xy_check
diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h
index cd015f919..df5ea1f17 100644
--- a/frame/1d/bli_l1d_check.h
+++ b/frame/1d/bli_l1d_check.h
@@ -90,6 +90,19 @@ GENTPROT( setd )
 GENTPROT( setid )
 
 
+#undef  GENTPROT
+#define GENTPROT( opname ) \
+\
+void PASTEMAC(opname,_check) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+    );
+
+GENTPROT( xpbyd )
+
+
 // -----------------------------------------------------------------------------
 
 void bli_l1d_xy_check
diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c
index e244f27b8..6c57b1ab3 100644
--- a/frame/1d/bli_l1d_fpa.c
+++ b/frame/1d/bli_l1d_fpa.c
@@ -59,4 +59,5 @@ GENFRONT( invertd )
 GENFRONT( scald )
 GENFRONT( setd )
 GENFRONT( setid )
+GENFRONT( xpbyd )
 
diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h
index 9eb66cdd3..915c2eb33 100644
--- a/frame/1d/bli_l1d_fpa.h
+++ b/frame/1d/bli_l1d_fpa.h
@@ -51,3 +51,5 @@ GENPROT( invertd )
 GENPROT( scald )
 GENPROT( setd )
 GENPROT( setid )
+GENPROT( xpbyd )
+
diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h
index debea1e62..5f6a487cf 100644
--- a/frame/1d/bli_l1d_ft.h
+++ b/frame/1d/bli_l1d_ft.h
@@ -131,3 +131,23 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
 
 INSERT_GENTDEFR( setid )
 
+// xpbyd
+
+#undef  GENTDEF
+#define GENTDEF( ctype, ch, opname, tsuf ) \
+\
+typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     );
+
+INSERT_GENTDEF( xpbyd )
+
diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c
index 9395b129b..a9a445e9b 100644
--- a/frame/1d/bli_l1d_oapi.c
+++ b/frame/1d/bli_l1d_oapi.c
@@ -312,5 +312,70 @@ void PASTEMAC(opname,EX_SUF) \
 GENFRONT( setid )
 
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+\
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+       BLIS_OAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_OAPI_EX_DECLS \
+\
+	num_t     dt        = bli_obj_dt( x ); \
+\
+	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
+	diag_t    diagx     = bli_obj_diag( x ); \
+	trans_t   transx    = bli_obj_conjtrans_status( x ); \
+	dim_t     m         = bli_obj_length( y ); \
+	dim_t     n         = bli_obj_width( y ); \
+	void*     buf_x     = bli_obj_buffer_at_off( x ); \
+	inc_t     rs_x      = bli_obj_row_stride( x ); \
+	inc_t     cs_x      = bli_obj_col_stride( x ); \
+	void*     buf_y     = bli_obj_buffer_at_off( y ); \
+	inc_t     rs_y      = bli_obj_row_stride( y ); \
+	inc_t     cs_y      = bli_obj_col_stride( y ); \
+\
+	void*     buf_beta; \
+\
+	obj_t     beta_local; \
+\
+	if ( bli_error_checking_is_enabled() ) \
+	    PASTEMAC(opname,_check)( x, beta, y ); \
+\
+	/* Create a local copy-cast of beta in datatype dt (the datatype of x).
+	   BLIS_NO_CONJUGATE is passed, so no conjugation is requested. */ \
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
+	                                      beta, &beta_local ); \
+	buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \
+\
+	/* Query a type-specific function pointer, except one that uses
+	   void* instead of typed pointers. */ \
+	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+\
+	f \
+	( \
+	   diagoffx, \
+	   diagx, \
+	   transx, \
+	   m, \
+	   n, \
+	   buf_x, rs_x, cs_x, \
+	   buf_beta, \
+	   buf_y, rs_y, cs_y, \
+	   cntx, \
+	   rntm  \
+	); \
+}
+
+GENFRONT( xpbyd )
+
+
 #endif
 
diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h
index 319896ead..48eedfc63 100644
--- a/frame/1d/bli_l1d_oapi.h
+++ b/frame/1d/bli_l1d_oapi.h
@@ -93,3 +93,17 @@ GENTPROT( scald )
 GENTPROT( setd )
 GENTPROT( setid )
 
+
+#undef  GENTPROT
+#define GENTPROT( opname ) \
+\
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+       BLIS_OAPI_EX_PARAMS  \
+     );
+
+GENTPROT( xpbyd )
+
diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c
index b6a24a604..f20269291 100644
--- a/frame/1d/bli_l1d_tapi.c
+++ b/frame/1d/bli_l1d_tapi.c
@@ -387,5 +387,83 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
 
 
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
+/* Typed API for opname: runs the level-1v kernel identified by kerid along one diagonal of x and y. */ \
+void PASTEMAC2(ch,opname,EX_SUF) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_TAPI_EX_DECLS \
+\
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	ctype*      x1; \
+	ctype*      y1; \
+	conj_t      conjx; \
+	dim_t       n_elem; \
+	dim_t       offx, offy; \
+	inc_t       incx, incy; \
+	/* Early exits: a zero dimension, or a diagonal that bli_is_outside_diag() reports as outside the matrix. */ \
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	if ( bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \
+\
+	/* Compute the offsets to the diagonals of x and y, the number of
+	   diagonal elements, and the increments that step along each diagonal. */ \
+	bli_set_dims_incs_2d \
+	( \
+	  diagoffx, transx, \
+	  m, n, rs_x, cs_x, rs_y, cs_y, \
+	  &offx, &offy, &n_elem, &incx, &incy \
+	); \
+	/* Only the conjugation part of transx is handed to the kernel. */ \
+	conjx = bli_extract_conj( transx ); \
+\
+	if ( bli_is_nonunit_diag( diagx ) ) \
+	{ \
+	    x1   = x + offx; \
+	    y1   = y + offy; \
+	} \
+	else /* if ( bli_is_unit_diag( diagx ) ) */ \
+	{ \
+	    /* Simulate a unit diagonal for x: point x1 at the constant one and
+	       use a zero increment so every element read is that constant. */ \
+	    x1   = PASTEMAC(ch,1); \
+	    incx = 0; \
+	    y1   = y + offy; \
+	} \
+\
+	/* Obtain a valid context from the gks if necessary. */ \
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
+\
+	/* Query the context for the operation's kernel address. */ \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+\
+	/* Invoke the kernel once over the n_elem diagonal elements. */ \
+	f( \
+	   conjx, \
+	   n_elem, \
+	   x1, incx, \
+	   beta, \
+	   y1, incy, \
+	   cntx  \
+	 ); \
+}
+
+INSERT_GENTFUNC_BASIC2( xpbyd,  xpbyv,  BLIS_XPBYV_KER )
+
+
 #endif
 
diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h
index 4314af5bc..5065768d1 100644
--- a/frame/1d/bli_l1d_tapi.h
+++ b/frame/1d/bli_l1d_tapi.h
@@ -125,3 +125,22 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 
 INSERT_GENTPROTR_BASIC0( setid )
 
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the typed interface of opname for the datatype described by ctype/ch. */ \
+void PASTEMAC2(ch,opname,EX_SUF) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     );
+
+INSERT_GENTPROT_BASIC0( xpbyd )
+
diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c
index df811cae3..a6115e2a0 100644
--- a/frame/1m/bli_l1m_check.c
+++ b/frame/1m/bli_l1m_check.c
@@ -88,6 +88,22 @@ GENFRONT( scalm )
 GENFRONT( setm )
 
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* Defines the _check function of opname, which forwards its operands to bli_l1m_axy_check(). */ \
+void PASTEMAC(opname,_check) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+     ) \
+{ \
+	bli_l1m_axy_check( beta, x, y ); /* beta goes first: presumably the scalar slot of the check -- confirm */ \
+}
+
+GENFRONT( xpbym )
+
+
 // -----------------------------------------------------------------------------
 
 void bli_l1m_xy_check
diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h
index 2e67e0674..90cf497b9 100644
--- a/frame/1m/bli_l1m_check.h
+++ b/frame/1m/bli_l1m_check.h
@@ -78,6 +78,19 @@ GENPROT( scalm )
 GENPROT( setm )
 
 
+#undef  GENPROT
+#define GENPROT( opname ) \
+/* Declares the _check function of opname, taking the x, beta, and y objects. */ \
+void PASTEMAC(opname,_check) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+    );
+
+GENPROT( xpbym )
+
+
 // -----------------------------------------------------------------------------
 
 void bli_l1m_xy_check
diff --git a/frame/1m/bli_l1m_fpa.c b/frame/1m/bli_l1m_fpa.c
index 46b0d5c37..0f60cecf7 100644
--- a/frame/1m/bli_l1m_fpa.c
+++ b/frame/1m/bli_l1m_fpa.c
@@ -57,4 +57,23 @@ GENFRONT( axpym )
 GENFRONT( scal2m )
 GENFRONT( scalm )
 GENFRONT( setm )
+GENFRONT( xpbym )
+
+//
+// Define function pointer query interfaces for two-datatype operations.
+//
+
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* Instantiate the two-datatype function pointer table for opname via GENARRAY_FPA2. */ \
+GENARRAY_FPA2( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
+               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
+/* Query function: returns the table entry at [dtx][dty]; the indices are used as given, without range checks. */ \
+PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \
+{ \
+	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \
+}
+
+GENFRONT( xpbym_md )
 
diff --git a/frame/1m/bli_l1m_fpa.h b/frame/1m/bli_l1m_fpa.h
index 3e07bf38d..076e2dec0 100644
--- a/frame/1m/bli_l1m_fpa.h
+++ b/frame/1m/bli_l1m_fpa.h
@@ -49,4 +49,13 @@ GENPROT( axpym )
 GENPROT( scal2m )
 GENPROT( scalm )
 GENPROT( setm )
+GENPROT( xpbym )
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+/* Declares the two-datatype query function of opname, keyed by the datatypes dtx and dty. */ \
+PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
+PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );
+
+GENPROT( xpbym_md )
 
diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h
index 9e7c9675e..593882ed0 100644
--- a/frame/1m/bli_l1m_ft.h
+++ b/frame/1m/bli_l1m_ft.h
@@ -141,3 +141,25 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
 INSERT_GENTDEF( scalm )
 INSERT_GENTDEF( setm )
 
+// xpbym, xpbym_md
+
+#undef  GENTDEF
+#define GENTDEF( ctype, ch, opname, tsuf ) \
+/* Function pointer type for opname. NOTE(review): xpbym_md reuses this single-ctype signature although its typed functions take distinct x and y types -- confirm intended. */ \
+typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     );
+
+INSERT_GENTDEF( xpbym )
+INSERT_GENTDEF( xpbym_md )
+
diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c
index 190d7857a..f66de309c 100644
--- a/frame/1m/bli_l1m_oapi.c
+++ b/frame/1m/bli_l1m_oapi.c
@@ -302,5 +302,141 @@ void PASTEMAC(opname,EX_SUF) \
 GENFRONT( setm )
 
 
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* Object API for opname: unpacks the x, beta, and y objects and calls the typed (void*) implementation. */ \
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+       BLIS_OAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_OAPI_EX_DECLS \
+	/* Differing x/y datatypes are delegated to xpbym_md. ISO C forbids returning an expression from a void function, so call first and then return. */ \
+	if ( bli_obj_dt( x ) != bli_obj_dt( y ) ) \
+	{ bli_xpbym_md( x, beta, y ); return; } \
+	/* NOTE(review): the delegation above does not forward cntx/rntm and happens before the _check call below. */ \
+	num_t     dt        = bli_obj_dt( x ); \
+\
+	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
+	diag_t    diagx     = bli_obj_diag( x ); \
+	uplo_t    uplox     = bli_obj_uplo( x ); \
+	trans_t   transx    = bli_obj_conjtrans_status( x ); \
+	dim_t     m         = bli_obj_length( y ); \
+	dim_t     n         = bli_obj_width( y ); \
+	void*     buf_x     = bli_obj_buffer_at_off( x ); \
+	inc_t     rs_x      = bli_obj_row_stride( x ); \
+	inc_t     cs_x      = bli_obj_col_stride( x ); \
+	void*     buf_y     = bli_obj_buffer_at_off( y ); \
+	inc_t     rs_y      = bli_obj_row_stride( y ); \
+	inc_t     cs_y      = bli_obj_col_stride( y ); \
+\
+	void*     buf_beta; \
+\
+	obj_t     beta_local; \
+\
+	if ( bli_error_checking_is_enabled() ) \
+	    PASTEMAC(opname,_check)( x, beta, y ); \
+\
+	/* Create a local copy of beta cast to the common datatype dt (no
+	   conjugation is requested) and take the address of its value. */ \
+	bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
+	                                      beta, &beta_local ); \
+	buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \
+\
+	/* Query a type-specific function pointer, except one that uses
+	   void* instead of typed pointers. */ \
+	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
+\
+	f \
+	( \
+	   diagoffx, \
+	   diagx, \
+	   uplox, \
+	   transx, \
+	   m, \
+	   n, \
+	   buf_x, rs_x, cs_x, \
+	   buf_beta, \
+	   buf_y, rs_y, cs_y, \
+	   cntx, \
+	   rntm  \
+	); \
+}
+
+GENFRONT( xpbym )
+
+
+#undef  GENFRONT
+#define GENFRONT( opname ) \
+/* Object API for the mixed-datatype opname: the datatypes of x and y are queried separately. */ \
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+       BLIS_OAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_OAPI_EX_DECLS \
+\
+	num_t     dtx       = bli_obj_dt( x ); \
+	num_t     dty       = bli_obj_dt( y ); \
+\
+	doff_t    diagoffx  = bli_obj_diag_offset( x ); \
+	diag_t    diagx     = bli_obj_diag( x ); \
+	uplo_t    uplox     = bli_obj_uplo( x ); \
+	trans_t   transx    = bli_obj_conjtrans_status( x ); \
+	dim_t     m         = bli_obj_length( y ); \
+	dim_t     n         = bli_obj_width( y ); \
+	void*     buf_x     = bli_obj_buffer_at_off( x ); \
+	inc_t     rs_x      = bli_obj_row_stride( x ); \
+	inc_t     cs_x      = bli_obj_col_stride( x ); \
+	void*     buf_y     = bli_obj_buffer_at_off( y ); \
+	inc_t     rs_y      = bli_obj_row_stride( y ); \
+	inc_t     cs_y      = bli_obj_col_stride( y ); \
+\
+	void*     buf_beta; \
+\
+	obj_t     beta_local; \
+	/* NOTE(review): no _check function is called anywhere in this function. */ \
+	/* Create a local copy of beta cast to the datatype of y (no
+	   conjugation is requested) and take the address of its value. */ \
+	bli_obj_scalar_init_detached_copy_of( dty, BLIS_NO_CONJUGATE, \
+	                                      beta, &beta_local ); \
+	buf_beta = bli_obj_buffer_for_1x1( dty, &beta_local ); \
+\
+	/* Query the function pointer for the (dtx, dty) datatype pair; it
+	   uses void* instead of typed pointers. */ \
+	PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
+	PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \
+\
+	f \
+	( \
+	   diagoffx, \
+	   diagx, \
+	   uplox, \
+	   transx, \
+	   m, \
+	   n, \
+	   buf_x, rs_x, cs_x, \
+	   buf_beta, \
+	   buf_y, rs_y, cs_y, \
+	   cntx, \
+	   rntm  \
+	); \
+}
+
+GENFRONT( xpbym_md )
+
+
+
 #endif
 
diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h
index 2f72f1167..4a42ab00d 100644
--- a/frame/1m/bli_l1m_oapi.h
+++ b/frame/1m/bli_l1m_oapi.h
@@ -80,3 +80,18 @@ void PASTEMAC(opname,EX_SUF) \
 GENPROT( scalm )
 GENPROT( setm )
 
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+/* Declares the object-based interface of opname; instantiated below for xpbym and xpbym_md. */ \
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  x, \
+       obj_t*  beta, \
+       obj_t*  y  \
+       BLIS_OAPI_EX_PARAMS  \
+     );
+
+GENPROT( xpbym )
+GENPROT( xpbym_md )
+
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index d852c4794..fb7173c47 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -382,5 +382,155 @@ INSERT_GENTFUNC_BASIC0( scalm )
 INSERT_GENTFUNC_BASIC0( setm )
 
 
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* Typed API for opname (x and y share one datatype): handles beta == 0 and the unit-diagonal case around the _unb_var1 helper. */ \
+void PASTEMAC2(ch,opname,EX_SUF) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_TAPI_EX_DECLS \
+\
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	/* Obtain a valid context from the gks if necessary. */ \
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
+\
+	/* If beta is zero, then the operation reduces to copym. */ \
+	if ( PASTEMAC(ch,eq0)( *beta ) ) \
+	{ \
+		PASTEMAC2(ch,copym,_unb_var1) \
+		( \
+		  diagoffx, \
+		  diagx, \
+		  uplox, \
+		  transx, \
+		  m, \
+		  n, \
+		  x, rs_x, cs_x, \
+		  y, rs_y, cs_y, \
+		  cntx, \
+		  rntm  \
+		); \
+		/* NOTE(review): this return skips the unit-diagonal post-processing done at the end of this function -- confirm that is intended. */ \
+		return; \
+	} \
+\
+	/* Invoke the helper variant, which loops over the appropriate kernel
+	   to implement the current operation. */ \
+	PASTEMAC2(ch,opname,_unb_var1) \
+	( \
+	  diagoffx, \
+	  diagx, \
+	  uplox, \
+	  transx, \
+	  m, \
+	  n, \
+	  x, rs_x, cs_x, \
+	  beta, \
+	  y, rs_y, cs_y, \
+	  cntx, \
+	  rntm  \
+	); \
+\
+	/* When the diagonal of an upper- or lower-stored matrix is unit,
+	   the diagonal is updated separately by calling xpbyd. */ \
+	if ( bli_is_upper_or_lower( uplox ) && \
+	     bli_is_unit_diag( diagx ) ) \
+	{ \
+		PASTEMAC2(ch,xpbyd,BLIS_TAPI_EX_SUF) \
+		( \
+		  diagoffx, \
+		  diagx, \
+		  transx, \
+		  m, \
+		  n, \
+		  x, rs_x, cs_x, \
+		  beta, \
+		  y, rs_y, cs_y, \
+		  cntx, \
+		  rntm  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( xpbym )
+
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
+/* Typed API for the mixed-datatype opname: x has type ctype_x while beta and y have type ctype_y. */ \
+void PASTEMAC3(chx,chy,opname,EX_SUF) \
+     ( \
+       doff_t   diagoffx, \
+       diag_t   diagx, \
+       uplo_t   uplox, \
+       trans_t  transx, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       ctype_y* beta, \
+       ctype_y* y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_TAPI_EX_DECLS \
+\
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	/* Obtain a valid context from the gks if necessary. */ \
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
+\
+	/* If beta is zero, x is simply cast into y with castm; diagoffx, diagx, and uplox are not passed to that call. */ \
+	if ( PASTEMAC(chy,eq0)( *beta ) ) \
+	{ \
+		PASTEMAC2(chx,chy,castm) \
+		( \
+		  transx, \
+		  m, \
+		  n, \
+		  x, rs_x, cs_x, \
+		  y, rs_y, cs_y  \
+		); \
+\
+		return; \
+	} \
+\
+	/* Invoke the mixed-datatype helper variant, which implements the
+	   operation with explicit loops over the elements. */ \
+	PASTEMAC3(chx,chy,opname,_unb_var1) \
+	( \
+	  diagoffx, \
+	  diagx, \
+	  uplox, \
+	  transx, \
+	  m, \
+	  n, \
+	  x, rs_x, cs_x, \
+	  beta, \
+	  y, rs_y, cs_y, \
+	  cntx, \
+	  rntm  \
+	); \
+}
+
+INSERT_GENTFUNC2_BASIC0( xpbym_md )
+INSERT_GENTFUNC2_MIXDP0( xpbym_md )
+
+
 #endif
 
diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h
index cacf93394..ccd2f77a9 100644
--- a/frame/1m/bli_l1m_tapi.h
+++ b/frame/1m/bli_l1m_tapi.h
@@ -98,3 +98,44 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 INSERT_GENTPROT_BASIC0( scalm )
 INSERT_GENTPROT_BASIC0( setm )
 
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the typed interface of opname, where x, beta, and y share the datatype ctype. */ \
+void PASTEMAC2(ch,opname,EX_SUF) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     );
+
+INSERT_GENTPROT_BASIC0( xpbym )
+
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
+/* Declares the mixed-datatype typed interface of opname: x is ctype_x; beta and y are ctype_y. */ \
+void PASTEMAC3(chx,chy,opname,EX_SUF) \
+     ( \
+       doff_t   diagoffx, \
+       diag_t   diagx, \
+       uplo_t   uplox, \
+       trans_t  transx, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       ctype_y* beta, \
+       ctype_y* y, inc_t rs_y, inc_t cs_y  \
+       BLIS_TAPI_EX_PARAMS  \
+     );
+
+INSERT_GENTPROT2_BASIC0( xpbym_md )
+INSERT_GENTPROT2_MIXDP0( xpbym_md )
+
diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c
index 2e80a5998..3d08db661 100644
--- a/frame/1m/bli_l1m_unb_var1.c
+++ b/frame/1m/bli_l1m_unb_var1.c
@@ -378,3 +378,252 @@ void PASTEMAC(ch,opname) \
 INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
 INSERT_GENTFUNC_BASIC2( setm_unb_var1,  setv,  BLIS_SETV_KER )
 
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
+/* Unblocked variant for opname: applies the level-1v kernel identified by kerid once per iteration over the stored region of x. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y, \
+       cntx_t* cntx, \
+       rntm_t* rntm  \
+     ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	ctype*   x1; \
+	ctype*   y1; \
+	uplo_t   uplox_eff; \
+	conj_t   conjx; \
+	dim_t    n_iter; \
+	dim_t    n_elem, n_elem_max; \
+	inc_t    ldx, incx; \
+	inc_t    ldy, incy; \
+	dim_t    j, i; \
+	dim_t    ij0, n_shift; \
+\
+	/* Set the loop parameters: effective uplo, iteration count, maximum
+	   vector length, strides, and the ij0/n_shift offsets used below. */ \
+	bli_set_dims_incs_uplo_2m \
+	( \
+	  diagoffx, diagx, transx, \
+	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
+	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
+	  &ij0, &n_shift \
+	); \
+	/* Nothing to do when the effective structure is all zeros. */ \
+	if ( bli_is_zeros( uplox_eff ) ) return; \
+\
+	/* Extract the conjugation component from the transx parameter. */ \
+	conjx = bli_extract_conj( transx ); \
+\
+	/* Query the kernel needed for this operation. */ \
+	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
+\
+	/* Handle dense and upper/lower storage cases separately. */ \
+	if ( bli_is_dense( uplox_eff ) ) \
+	{ \
+		for ( j = 0; j < n_iter; ++j ) \
+		{ \
+			n_elem = n_elem_max; \
+\
+			x1     = x + (j  )*ldx + (0  )*incx; \
+			y1     = y + (j  )*ldy + (0  )*incy; \
+\
+			/* Dense: every iteration processes a full-length vector. */ \
+			f( \
+			   conjx, \
+			   n_elem, \
+			   x1, incx, \
+			   beta, \
+			   y1, incy, \
+			   cntx  \
+			 ); \
+		} \
+	} \
+	else \
+	{ \
+		if ( bli_is_upper( uplox_eff ) ) \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
+\
+				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
+				y1     = y + (ij0+j  )*ldy + (0  )*incy; \
+\
+				/* Upper: vector ij0+j, from its first element, with length growing with j up to n_elem_max. */ \
+				f( \
+				   conjx, \
+				   n_elem, \
+				   x1, incx, \
+				   beta, \
+				   y1, incy, \
+				   cntx  \
+				 ); \
+			} \
+		} \
+		else if ( bli_is_lower( uplox_eff ) ) \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
+				n_elem = n_elem_max - i; \
+\
+				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
+				y1     = y + (j  )*ldy + (ij0+i  )*incy; \
+\
+				/* Lower: vector j, starting at element ij0+i, with the first i elements skipped. */ \
+				f( \
+				   conjx, \
+				   n_elem, \
+				   x1, incx, \
+				   beta, \
+				   y1, incy, \
+				   cntx  \
+				 ); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC2( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )
+
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
+/* Mixed-datatype unblocked variant for opname: updates y from x element by element with the adds/xpbys scalar macros. */ \
+void PASTEMAC2(chx,chy,opname) \
+     ( \
+       doff_t   diagoffx, \
+       diag_t   diagx, \
+       uplo_t   uplox, \
+       trans_t  transx, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       ctype_y* beta, \
+       ctype_y* y, inc_t rs_y, inc_t cs_y, \
+       cntx_t*  cntx, \
+       rntm_t*  rntm  \
+     ) \
+{ \
+	ctype_x* restrict x1; \
+	ctype_y* restrict y1; \
+	uplo_t            uplox_eff; \
+	dim_t             n_iter; \
+	dim_t             n_elem, n_elem_max; \
+	inc_t             ldx, incx; \
+	inc_t             ldy, incy; \
+	dim_t             j, i; \
+	dim_t             ij0, n_shift; \
+\
+	/* Set various loop parameters. Of the outputs, uplox_eff, ij0, and n_shift are not read below: every iteration uses the full n_elem_max. */ \
+	bli_set_dims_incs_uplo_2m \
+	( \
+	  diagoffx, diagx, transx, \
+	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
+	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
+	  &ij0, &n_shift \
+	); \
+\
+	/* NOTE: conjugation of x is not applied here; the extraction below is disabled. */ \
+	/*conjx = bli_extract_conj( transx );*/ \
+\
+	/* Branch on beta (one: adds; otherwise: xpbys), then on unit versus general element strides. */ \
+	if ( PASTEMAC(chy,eq1)( *beta ) ) \
+	{ \
+		if ( incx == 1 && incy == 1 ) \
+		{ \
+			n_elem = n_elem_max; \
+\
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				x1     = x + (j  )*ldx + (0  )*incx; \
+				y1     = y + (j  )*ldy + (0  )*incy; \
+\
+				ctype_x* restrict chi1 = x1; \
+				ctype_y* restrict psi1 = y1; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(chx,chy,adds)( chi1[i], psi1[i] ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			n_elem = n_elem_max; \
+\
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				x1     = x + (j  )*ldx + (0  )*incx; \
+				y1     = y + (j  )*ldy + (0  )*incy; \
+\
+				ctype_x* restrict chi1 = x1; \
+				ctype_y* restrict psi1 = y1; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \
+\
+					chi1 += incx; \
+					psi1 += incy; \
+				} \
+			} \
+		} \
+	} \
+	else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \
+	{ \
+		if ( incx == 1 && incy == 1 ) \
+		{ \
+			n_elem = n_elem_max; \
+\
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				x1     = x + (j  )*ldx + (0  )*incx; \
+				y1     = y + (j  )*ldy + (0  )*incy; \
+\
+				ctype_x* restrict chi1 = x1; \
+				ctype_y* restrict psi1 = y1; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC3(chx,chy,chy,xpbys)( chi1[i], *beta, psi1[i] ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			n_elem = n_elem_max; \
+\
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				x1     = x + (j  )*ldx + (0  )*incx; \
+				y1     = y + (j  )*ldy + (0  )*incy; \
+\
+				ctype_x* restrict chi1 = x1; \
+				ctype_y* restrict psi1 = y1; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
+\
+					chi1 += incx; \
+					psi1 += incy; \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 )
+INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 )
+
diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h
index b42e1035a..f6014f6b4 100644
--- a/frame/1m/bli_l1m_unb_var1.h
+++ b/frame/1m/bli_l1m_unb_var1.h
@@ -101,3 +101,46 @@ void PASTEMAC2(ch,opname,_unb_var1) \
 INSERT_GENTPROT_BASIC0( scalm )
 INSERT_GENTPROT_BASIC0( setm )
 
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* Declares the _unb_var1 helper of opname; cntx and rntm are explicit parameters here. */ \
+void PASTEMAC2(ch,opname,_unb_var1) \
+     ( \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       trans_t transx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  x, inc_t rs_x, inc_t cs_x, \
+       ctype*  beta, \
+       ctype*  y, inc_t rs_y, inc_t cs_y, \
+       cntx_t* cntx, \
+       rntm_t* rntm  \
+     );
+
+INSERT_GENTPROT_BASIC0( xpbym )
+
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
+/* Declares the mixed-datatype _unb_var1 helper of opname: x is ctype_x; beta and y are ctype_y. */ \
+void PASTEMAC3(chx,chy,opname,_unb_var1) \
+     ( \
+       doff_t   diagoffx, \
+       diag_t   diagx, \
+       uplo_t   uplox, \
+       trans_t  transx, \
+       dim_t    m, \
+       dim_t    n, \
+       ctype_x* x, inc_t rs_x, inc_t cs_x, \
+       ctype_y* beta, \
+       ctype_y* y, inc_t rs_y, inc_t cs_y, \
+       cntx_t*  cntx, \
+       rntm_t*  rntm  \
+     );
+
+INSERT_GENTPROT2_BASIC0( xpbym_md )
+INSERT_GENTPROT2_MIXDP0( xpbym_md )
+
diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h
index a336cf9f2..6c88ea893 100644
--- a/frame/1m/packm/bli_packm.h
+++ b/frame/1m/packm/bli_packm.h
@@ -55,3 +55,8 @@
 #include "bli_packm_cxk_rih.h"
 #include "bli_packm_cxk_1er.h"
 
+// Mixed datatype support.
+#ifdef BLIS_ENABLE_GEMM_MD
+#include "bli_packm_md.h"
+#endif
+
diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c
index 383462726..195315886 100644
--- a/frame/1m/packm/bli_packm_blk_var1.c
+++ b/frame/1m/packm/bli_packm_blk_var1.c
@@ -108,7 +108,17 @@ void bli_packm_blk_var1
        thrinfo_t* t
      )
 {
-	num_t     dt_cp      = bli_obj_dt( c );
+#ifdef BLIS_ENABLE_GEMM_MD
+	// Call a different packm implementation when the storage and target
+	// datatypes differ.
+	if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) )
+	{
+		bli_packm_blk_var1_md( c, p, cntx, cntl, t );
+		return;
+	}
+#endif
+
+	num_t     dt_c       = bli_obj_dt( c );
 
 	struc_t   strucc     = bli_obj_struc( c );
 	doff_t    diagoffc   = bli_obj_diag_offset( c );
@@ -155,7 +165,7 @@ void bli_packm_blk_var1
 		// higher-level operation. Thus, we use BLIS_ONE for kappa so
 		// that the underlying packm implementation does not perform
 		// any scaling during packing.
-		buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
+		buf_kappa = bli_obj_buffer_for_const( dt_c, &BLIS_ONE );
 	}
 	else // if ( bli_is_ind_packed( schema ) )
 	{
@@ -187,11 +197,10 @@ void bli_packm_blk_var1
 		}
 	
 		// Acquire the buffer to the kappa chosen above.
-		buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
+		buf_kappa = bli_obj_buffer_for_1x1( dt_c, kappa_p );
 	}
 
 
-	// Choose the correct func_t object based on the pack_t schema.
 #if 0
 	if      ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
 	else if ( bli_is_3mi_packed( schema ) ||
@@ -208,7 +217,7 @@ void bli_packm_blk_var1
 
 	//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
 
-	//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
+	//if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) )
 	{
 		// If the packm structure-aware kernel func_t in the context is
 		// NULL (which is the default value after the context is created),
@@ -230,11 +239,11 @@ void bli_packm_blk_var1
 #endif
 
 	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker = bli_func_get_dt( dt_cp, packm_kers );
+	packm_ker = bli_func_get_dt( dt_c, packm_kers );
 
 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_cp];
+	f = ftypes[dt_c];
 
 	// Invoke the function.
 	f( strucc,
@@ -433,10 +442,10 @@ void PASTEMAC(ch,varname) \
 \
 /*
 if ( row_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", m, n, \
                       c_cast,        rs_c, cs_c, "%4.1f", "" ); \
 if ( col_stored ) \
-PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \
                       c_cast,        rs_c, cs_c, "%4.1f", "" ); \
 */ \
 \
@@ -605,6 +614,15 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
 		} \
 \
 /*
+if ( row_stored ) \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+else \
+PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
+                               p_use, rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+\
+/*
 if ( col_stored ) { \
 	if ( bli_thread_work_id( thread ) == 0 ) \
 	{ \
diff --git a/frame/1m/packm/bli_packm_blk_var1.c.old b/frame/1m/packm/bli_packm_blk_var1.c.old
deleted file mode 100644
index 4b18302f4..000000000
--- a/frame/1m/packm/bli_packm_blk_var1.c.old
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
-      of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T packm_fp
-
-typedef void (*FUNCPTR_T)(
-                           struc_t strucc,
-                           doff_t  diagoffc,
-                           diag_t  diagc,
-                           uplo_t  uploc,
-                           trans_t transc,
-                           pack_t  schema,
-                           bool_t  invdiag,
-                           bool_t  revifup,
-                           bool_t  reviflo,
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   m_max,
-                           dim_t   n_max,
-                           void*   kappa,
-                           void*   c, inc_t rs_c, inc_t cs_c,
-                           void*   p, inc_t rs_p, inc_t cs_p,
-                                      inc_t is_p,
-                                      dim_t pd_p, inc_t ps_p,
-                           void*   packm_ker,
-                           packm_thrinfo_t* thread
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
-
-extern func_t* packm_struc_cxk_kers;
-
-
-void bli_packm_blk_var1( obj_t*   c,
-                         obj_t*   p,
-                         packm_thrinfo_t* t )
-{
-	num_t     dt_cp      = bli_obj_dt( c );
-
-	struc_t   strucc     = bli_obj_struc( c );
-	doff_t    diagoffc   = bli_obj_diag_offset( c );
-	diag_t    diagc      = bli_obj_diag( c );
-	uplo_t    uploc      = bli_obj_uplo( c );
-	trans_t   transc     = bli_obj_conjtrans_status( c );
-	pack_t    schema     = bli_obj_pack_schema( p );
-	bool_t    invdiag    = bli_obj_has_inverted_diag( p );
-	bool_t    revifup    = bli_obj_is_pack_rev_if_upper( p );
-	bool_t    reviflo    = bli_obj_is_pack_rev_if_lower( p );
-
-	dim_t     m_p        = bli_obj_length( p );
-	dim_t     n_p        = bli_obj_width( p );
-	dim_t     m_max_p    = bli_obj_padded_length( p );
-	dim_t     n_max_p    = bli_obj_padded_width( p );
-
-	void*     buf_c      = bli_obj_buffer_at_off( c );
-	inc_t     rs_c       = bli_obj_row_stride( c );
-	inc_t     cs_c       = bli_obj_col_stride( c );
-
-	void*     buf_p      = bli_obj_buffer_at_off( p );
-	inc_t     rs_p       = bli_obj_row_stride( p );
-	inc_t     cs_p       = bli_obj_col_stride( p );
-	inc_t     is_p       = bli_obj_imag_stride( p );
-	dim_t     pd_p       = bli_obj_panel_dim( p );
-	inc_t     ps_p       = bli_obj_panel_stride( p );
-
-	void*     buf_kappa;
-
-	func_t*   packm_kers;
-	void*     packm_ker;
-
-	FUNCPTR_T f;
-
-	// This variant assumes that the micro-kernel will always apply the
-	// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
-	// for kappa so that the underlying packm implementation does not
-	// scale during packing.
-	buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
-
-	// Choose the correct func_t object.
-	packm_kers = packm_struc_cxk_kers;
-
-	// Query the datatype-specific function pointer from the func_t object.
-	packm_ker = bli_func_obj_query( dt_cp, packm_kers );
-
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_cp];
-
-	// Invoke the function.
-	f( strucc,
-	   diagoffc,
-	   diagc,
-	   uploc,
-	   transc,
-	   schema,
-	   invdiag,
-	   revifup,
-	   reviflo,
-	   m_p,
-	   n_p,
-	   m_max_p,
-	   n_max_p,
-	   buf_kappa,
-	   buf_c, rs_c, cs_c,
-	   buf_p, rs_p, cs_p,
-	          is_p,
-	          pd_p, ps_p,
-	   packm_ker,
-	   t );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kertype ) \
-\
-void PASTEMAC(ch,varname) \
-                           struc_t strucc, \
-                           doff_t  diagoffc, \
-                           diag_t  diagc, \
-                           uplo_t  uploc, \
-                           trans_t transc, \
-                           pack_t  schema, \
-                           bool_t  invdiag, \
-                           bool_t  revifup, \
-                           bool_t  reviflo, \
-                           dim_t   m, \
-                           dim_t   n, \
-                           dim_t   m_max, \
-                           dim_t   n_max, \
-                           void*   kappa, \
-                           void*   c, inc_t rs_c, inc_t cs_c, \
-                           void*   p, inc_t rs_p, inc_t cs_p, \
-                                      inc_t is_p, \
-                                      dim_t pd_p, inc_t ps_p, \
-                           void*   packm_ker, \
-                           packm_thrinfo_t* thread \
-                         ) \
-{ \
-	PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
-\
-	ctype* restrict kappa_cast = kappa; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict p_cast     = p; \
-	ctype* restrict c_begin; \
-	ctype* restrict p_begin; \
-\
-	dim_t           iter_dim; \
-	dim_t           num_iter; \
-	dim_t           it, ic, ip; \
-	dim_t           ic0, ip0; \
-	doff_t          ic_inc, ip_inc; \
-	doff_t          diagoffc_i; \
-	doff_t          diagoffc_inc; \
-	dim_t           panel_len_full; \
-	dim_t           panel_len_i; \
-	dim_t           panel_len_max; \
-	dim_t           panel_len_max_i; \
-	dim_t           panel_dim_i; \
-	dim_t           panel_dim_max; \
-	dim_t           panel_off_i; \
-	inc_t           vs_c; \
-	inc_t           ldc; \
-	inc_t           ldp, p_inc; \
-	dim_t*          m_panel_full; \
-	dim_t*          n_panel_full; \
-	dim_t*          m_panel_use; \
-	dim_t*          n_panel_use; \
-	dim_t*          m_panel_max; \
-	dim_t*          n_panel_max; \
-	conj_t          conjc; \
-	bool_t          row_stored; \
-	bool_t          col_stored; \
-\
-	ctype* restrict c_use; \
-	ctype* restrict p_use; \
-	doff_t          diagoffp_i; \
-\
-\
-	/* If C is zeros and part of a triangular matrix, then we don't need
-	   to pack it. */ \
-	if ( bli_is_zeros( uploc ) && \
-	     bli_is_triangular( strucc ) ) return; \
-\
-	/* Extract the conjugation bit from the transposition argument. */ \
-	conjc = bli_extract_conj( transc ); \
-\
-	/* If c needs a transposition, induce it so that we can more simply
-	   express the remaining parameters and code. */ \
-	if ( bli_does_trans( transc ) ) \
-	{ \
-		bli_swap_incs( &rs_c, &cs_c ); \
-		bli_negate_diag_offset( &diagoffc ); \
-		bli_toggle_uplo( &uploc ); \
-		bli_toggle_trans( &transc ); \
-	} \
-\
-	/* Create flags to incidate row or column storage. Note that the
-	   schema bit that encodes row or column is describing the form of
-	   micro-panel, not the storage in the micro-panel. Hence the
-	   mismatch in "row" and "column" semantics. */ \
-	row_stored = bli_is_col_packed( schema ); \
-	col_stored = bli_is_row_packed( schema ); \
-\
-	/* If the row storage flag indicates row storage, then we are packing
-	   to column panels; otherwise, if the strides indicate column storage,
-	   we are packing to row panels. */ \
-	if ( row_stored ) \
-	{ \
-		/* Prepare to pack to row-stored column panels. */ \
-		iter_dim       = n; \
-		panel_len_full = m; \
-		panel_len_max  = m_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = rs_c; \
-		vs_c           = cs_c; \
-		diagoffc_inc   = -( doff_t )panel_dim_max; \
-		ldp            = rs_p; \
-		m_panel_full   = &m; \
-		n_panel_full   = &panel_dim_i; \
-		m_panel_use    = &panel_len_i; \
-		n_panel_use    = &panel_dim_i; \
-		m_panel_max    = &panel_len_max_i; \
-		n_panel_max    = &panel_dim_max; \
-	} \
-	else /* if ( col_stored ) */ \
-	{ \
-		/* Prepare to pack to column-stored row panels. */ \
-		iter_dim       = m; \
-		panel_len_full = n; \
-		panel_len_max  = n_max; \
-		panel_dim_max  = pd_p; \
-		ldc            = cs_c; \
-		vs_c           = rs_c; \
-		diagoffc_inc   = ( doff_t )panel_dim_max; \
-		ldp            = cs_p; \
-		m_panel_full   = &panel_dim_i; \
-		n_panel_full   = &n; \
-		m_panel_use    = &panel_dim_i; \
-		n_panel_use    = &panel_len_i; \
-		m_panel_max    = &panel_dim_max; \
-		n_panel_max    = &panel_len_max_i; \
-	} \
-\
-	/* Compute the total number of iterations we'll need. */ \
-	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
-\
-	/* Set the initial values and increments for indices related to C and P
-	   based on whether reverse iteration was requested. */ \
-	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
-	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
-	{ \
-		ic0    = (num_iter - 1) * panel_dim_max; \
-		ic_inc = -panel_dim_max; \
-		ip0    = num_iter - 1; \
-		ip_inc = -1; \
-	} \
-	else \
-	{ \
-		ic0    = 0; \
-		ic_inc = panel_dim_max; \
-		ip0    = 0; \
-		ip_inc = 1; \
-	} \
-\
-	p_begin = p_cast; \
-\
-	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < num_iter; \
-	      ic += ic_inc, ip += ip_inc, it += 1 ) \
-	{ \
-		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
-\
-		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
-		c_begin     = c_cast   + (ic  )*vs_c; \
-\
-		if ( bli_is_triangular( strucc ) &&  \
-		     bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is completely unstored (ie: zero). If the panel
-			   is unstored, we do nothing. (Notice that we don't even
-			   increment p_begin.) */ \
-\
-			continue; \
-		} \
-		else if ( bli_is_triangular( strucc ) &&  \
-		          bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
-		{ \
-			/* This case executes if the panel belongs to a triangular
-			   matrix AND is diagonal-intersecting. Notice that we
-			   cannot bury the following conditional logic into
-			   packm_struc_cxk() because we need to know the value of
-			   panel_len_max_i so we can properly increment p_inc. */ \
-\
-			/* Sanity check. Diagonals should not intersect the short end of
-			   a micro-panel. If they do, then somehow the constraints on
-			   cache blocksizes being a whole multiple of the register
-			   blocksizes was somehow violated. */ \
-			if ( ( col_stored && diagoffc_i < 0 ) || \
-			     ( row_stored && diagoffc_i > 0 ) ) \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
-\
-			if      ( ( row_stored && bli_is_upper( uploc ) ) || \
-			          ( col_stored && bli_is_lower( uploc ) ) )  \
-			{ \
-				panel_off_i     = 0; \
-				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
-				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
-				                           panel_len_max ); \
-				diagoffp_i      = diagoffc_i; \
-			} \
-			else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
-			             ( col_stored && bli_is_upper( uploc ) ) )  */ \
-			{ \
-				panel_off_i     = bli_abs( diagoffc_i ); \
-				panel_len_i     = panel_len_full - panel_off_i; \
-				panel_len_max_i = panel_len_max  - panel_off_i; \
-				diagoffp_i      = 0; \
-			} \
-\
-			c_use = c_begin + (panel_off_i  )*ldc; \
-			p_use = p_begin; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffp_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_use, rs_c, cs_c, \
-				                p_use, rs_p, cs_p, \
-				                       is_p ); \
-			} \
-\
-			/* NOTE: This value is usually LESS than ps_p because triangular
-			   matrices usually have several micro-panels that are shorter
-			   than a "full" micro-panel. */ \
-			p_inc = ldp * panel_len_max_i; \
-\
-			/* We nudge the panel increment up by one if it is odd. */ \
-			p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
-		} \
-		else if ( bli_is_herm_or_symm( strucc ) ) \
-		{ \
-			/* This case executes if the panel belongs to a Hermitian or
-			   symmetric matrix, which includes stored, unstored, and
-			   diagonal-intersecting panels. */ \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( strucc, \
-				                diagoffc_i, \
-				                diagc, \
-				                uploc, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_begin, rs_c, cs_c, \
-				                p_begin, rs_p, cs_p, \
-				                         is_p ); \
-			} \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			/*p_inc = ldp * panel_len_max_i;*/ \
-			p_inc = ps_p; \
-		} \
-		else \
-		{ \
-			/* This case executes if the panel is general, or, if the
-			   panel is part of a triangular matrix and is neither unstored
-			   (ie: zero) nor diagonal-intersecting. */ \
-\
-			panel_len_i     = panel_len_full; \
-			panel_len_max_i = panel_len_max; \
-\
-			if( packm_thread_my_iter( it, thread ) ) \
-			{ \
-				packm_ker_cast( BLIS_GENERAL, \
-				                0, \
-				                diagc, \
-				                BLIS_DENSE, \
-				                conjc, \
-				                schema, \
-				                invdiag, \
-				                *m_panel_use, \
-				                *n_panel_use, \
-				                *m_panel_max, \
-				                *n_panel_max, \
-				                kappa_cast, \
-				                c_begin, rs_c, cs_c, \
-				                p_begin, rs_p, cs_p, \
-				                         is_p ); \
-			} \
-/*
-			if ( row_stored ) \
-			PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
-			                      p_begin, rs_p, cs_p, "%9.2e", "" ); \
-			else if ( col_stored ) \
-			PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
-			                      p_begin, rs_p, cs_p, "%9.2e", "" ); \
-*/ \
-\
-			/* NOTE: This value is equivalent to ps_p. */ \
-			/*p_inc = ldp * panel_len_max_i;*/ \
-			p_inc = ps_p; \
-		} \
-\
-\
-		p_begin += p_inc; \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
-
diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c
new file mode 100644
index 000000000..4efd0074c
--- /dev/null
+++ b/frame/1m/packm/bli_packm_blk_var1_md.c
@@ -0,0 +1,293 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_GEMM_MD
+
+#define FUNCPTR_T packm_fp
+
+typedef void (*FUNCPTR_T)(
+                           trans_t transc,
+                           pack_t  schema,
+                           dim_t   m,
+                           dim_t   n,
+                           dim_t   m_max,
+                           dim_t   n_max,
+                           void*   kappa,
+                           void*   c, inc_t rs_c, inc_t cs_c,
+                           void*   p, inc_t rs_p, inc_t cs_p,
+                                      inc_t is_p,
+                                      dim_t pd_p, inc_t ps_p,
+                           cntx_t* cntx,
+                           thrinfo_t* thread
+                         );
+
+static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
+
+
+void bli_packm_blk_var1_md
+     (
+       obj_t*   c,
+       obj_t*   p,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       thrinfo_t* t
+     )
+{
+	num_t     dt_c       = bli_obj_dt( c );
+	num_t     dt_p       = bli_obj_dt( p );
+
+	trans_t   transc     = bli_obj_conjtrans_status( c );
+	pack_t    schema     = bli_obj_pack_schema( p );
+
+	dim_t     m_p        = bli_obj_length( p );
+	dim_t     n_p        = bli_obj_width( p );
+	dim_t     m_max_p    = bli_obj_padded_length( p );
+	dim_t     n_max_p    = bli_obj_padded_width( p );
+
+	void*     buf_c      = bli_obj_buffer_at_off( c );
+	inc_t     rs_c       = bli_obj_row_stride( c );
+	inc_t     cs_c       = bli_obj_col_stride( c );
+
+	void*     buf_p      = bli_obj_buffer_at_off( p );
+	inc_t     rs_p       = bli_obj_row_stride( p );
+	inc_t     cs_p       = bli_obj_col_stride( p );
+	inc_t     is_p       = bli_obj_imag_stride( p );
+	dim_t     pd_p       = bli_obj_panel_dim( p );
+	inc_t     ps_p       = bli_obj_panel_stride( p );
+
+	void*     buf_kappa;
+
+	FUNCPTR_T f;
+
+
+	// Unconditionally use kappa = 1.0. Thus, we don't support scaling
+	// during packing when mixing datatypes.
+	buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_c][dt_p];
+
+	// Invoke the function.
+	f(
+	   transc,
+	   schema,
+	   m_p,
+	   n_p,
+	   m_max_p,
+	   n_max_p,
+	   buf_kappa,
+	   buf_c, rs_c, cs_c,
+	   buf_p, rs_p, cs_p,
+	          is_p,
+	          pd_p, ps_p,
+	   cntx,
+	   t );
+}
+
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
+\
+void PASTEMAC2(chc,chp,varname) \
+     ( \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       void*   kappa, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       void*   p, inc_t rs_p, inc_t cs_p, \
+                  inc_t is_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	ctype_p* restrict kappa_cast = kappa; \
+	ctype_c* restrict c_cast     = c; \
+	ctype_p* restrict p_cast     = p; \
+	ctype_c* restrict c_begin; \
+	ctype_p* restrict p_begin; \
+\
+	dim_t             iter_dim; \
+	dim_t             num_iter; \
+	dim_t             it, ic, ip; \
+	doff_t            ic_inc, ip_inc; \
+	dim_t             panel_len_full; \
+	dim_t             panel_len_i; \
+	dim_t             panel_len_max; \
+	dim_t             panel_len_max_i; \
+	dim_t             panel_dim_i; \
+	dim_t             panel_dim_max; \
+	inc_t             vs_c; \
+	inc_t             p_inc; \
+	dim_t*            m_panel_use; \
+	dim_t*            n_panel_use; \
+	dim_t*            m_panel_max; \
+	dim_t*            n_panel_max; \
+	conj_t            conjc; \
+	bool_t            row_stored; \
+	bool_t            col_stored; \
+\
+	ctype_c* restrict c_use; \
+	ctype_p* restrict p_use; \
+\
+\
+	/* Extract the conjugation bit from the transposition argument. */ \
+	conjc = bli_extract_conj( transc ); \
+\
+	/* If c needs a transposition, induce it so that we can more simply
+	   express the remaining parameters and code. */ \
+	if ( bli_does_trans( transc ) ) \
+	{ \
+		bli_swap_incs( &rs_c, &cs_c ); \
+		bli_toggle_trans( &transc ); \
+	} \
+\
+	/* Create flags to indicate row or column storage. Note that the
+	   schema bit that encodes row or column is describing the form of
+	   micro-panel, not the storage in the micro-panel. Hence the
+	   mismatch in "row" and "column" semantics. */ \
+	row_stored = bli_is_col_packed( schema ); \
+	col_stored = bli_is_row_packed( schema ); \
+\
+	( void )col_stored; \
+\
+	/* If the row storage flag is set, then we are packing to column
+	   panels; otherwise, column storage is assumed and we are packing
+	   to row panels. */ \
+	if ( row_stored ) \
+	{ \
+		/* Prepare to pack to row-stored column panels. */ \
+		iter_dim       = n; \
+		panel_len_full = m; \
+		panel_len_max  = m_max; \
+		panel_dim_max  = pd_p; \
+		vs_c           = cs_c; \
+		m_panel_use    = &panel_len_i; \
+		n_panel_use    = &panel_dim_i; \
+		m_panel_max    = &panel_len_max_i; \
+		n_panel_max    = &panel_dim_max; \
+	} \
+	else /* if ( col_stored ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panels. */ \
+		iter_dim       = m; \
+		panel_len_full = n; \
+		panel_len_max  = n_max; \
+		panel_dim_max  = pd_p; \
+		vs_c           = rs_c; \
+		m_panel_use    = &panel_dim_i; \
+		n_panel_use    = &panel_len_i; \
+		m_panel_max    = &panel_dim_max; \
+		n_panel_max    = &panel_len_max_i; \
+	} \
+\
+	/* Compute the total number of iterations we'll need. */ \
+	num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
+\
+	{ \
+		ic_inc = panel_dim_max; \
+		ip_inc = 1; \
+	} \
+\
+	p_begin = p_cast; \
+\
+/*
+if ( row_stored ) \
+PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: b orig", m, n, \
+                       c_cast,        rs_c, cs_c, "%5.2f", "" ); \
+if ( col_stored ) \
+PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \
+                       c_cast,        rs_c, cs_c, "%5.2f", "" ); \
+*/ \
+\
+	for ( ic  = 0,      ip  = 0,      it  = 0; it < num_iter; \
+	      ic += ic_inc, ip += ip_inc, it += 1 ) \
+	{ \
+		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
+\
+		c_begin     = c_cast + (ic  )*vs_c; \
+\
+		{ \
+			c_use = c_begin; \
+			p_use = p_begin; \
+\
+			panel_len_i     = panel_len_full; \
+			panel_len_max_i = panel_len_max; \
+\
+			if( packm_thread_my_iter( it, thread ) ) \
+			{ \
+				PASTEMAC2(chc,chp,packm_struc_cxk_md) \
+				( \
+				  conjc, \
+				  schema, \
+				  *m_panel_use, \
+				  *n_panel_use, \
+				  *m_panel_max, \
+				  *n_panel_max, \
+				  kappa_cast, \
+				  c_use, rs_c, cs_c, \
+				  p_use, rs_p, cs_p, \
+			             is_p, \
+				  cntx \
+				); \
+			} \
+\
+			p_inc = ps_p; \
+		} \
+\
+/*
+if ( row_stored ) \
+PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
+                                p_use, rs_p, cs_p, "%5.2f", "" ); \
+else \
+PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
+                                p_use, rs_p, cs_p, "%5.2f", "" ); \
+*/ \
+\
+		p_begin += p_inc; \
+\
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
+INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
+
+#endif
diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h
new file mode 100644
index 000000000..519749143
--- /dev/null
+++ b/frame/1m/packm/bli_packm_blk_var1_md.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_packm_blk_var1_md
+     (
+       obj_t*   c,
+       obj_t*   p,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       thrinfo_t* t
+     );
+
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
+\
+void PASTEMAC2(chc,chp,varname) \
+     ( \
+       trans_t transc, \
+       pack_t  schema, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   m_max, \
+       dim_t   n_max, \
+       void*   kappa, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       void*   p, inc_t rs_p, inc_t cs_p, \
+                  inc_t is_p, \
+                  dim_t pd_p, inc_t ps_p, \
+       cntx_t* cntx, \
+       thrinfo_t* thread  \
+     );
+
+INSERT_GENTPROT2_BASIC0( packm_blk_var1_md )
+INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md )
+
diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c
index 0afd06e27..4ecef849f 100644
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -211,13 +211,14 @@ siz_t bli_packm_init_pack
 	bli_init_once();
 
 	num_t     dt           = bli_obj_dt( a );
+	num_t     dt_tar       = bli_obj_target_dt( a );
 	trans_t   transa       = bli_obj_onlytrans_status( a );
 	dim_t     m_a          = bli_obj_length( a );
 	dim_t     n_a          = bli_obj_width( a );
-	dim_t     bmult_m_def  = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
-	dim_t     bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
-	dim_t     bmult_n_def  = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx );
-	dim_t     bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx );
+	dim_t     bmult_m_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
+	dim_t     bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
+	dim_t     bmult_n_def  = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
+	dim_t     bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx );
 
 	dim_t     m_p, n_p;
 	dim_t     m_p_pad, n_p_pad;
@@ -230,6 +231,17 @@ siz_t bli_packm_init_pack
 	// We begin by copying the fields of A.
 	bli_obj_alias_to( a, p );
 
+	// Typecast the internal scalar value to the target datatype.
+	// NOTE: This must happen BEFORE we change the datatype of P to reflect
+	// the target_dt.
+	if ( dt != dt_tar )
+	{
+		bli_obj_scalar_cast_to( dt_tar, p );
+	}
+
+	// Update the datatype of P to be the target datatype of A.
+	bli_obj_set_dt( dt_tar, p );
+
 	// Update the dimension fields to explicitly reflect a transposition,
 	// if needed.
 	// Then, clear the conjugation and transposition fields from the object
diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_md.h
new file mode 100644
index 000000000..7620a572e
--- /dev/null
+++ b/frame/1m/packm/bli_packm_md.h
@@ -0,0 +1,37 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_packm_blk_var1_md.h"
+#include "bli_packm_struc_cxk_md.h"
+
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
new file mode 100644
index 000000000..33d720f30
--- /dev/null
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -0,0 +1,294 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_GEMM_MD
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
+/* Pack one micro-panel of c (ctype_c) into p (ctype_p), then set any edge region of p (up to m_panel_max x n_panel_max) using zero. */ \
+void PASTEMAC2(chc,chp,varname) \
+     ( \
+       conj_t            conjc, \
+       pack_t            schema, \
+       dim_t             m_panel, \
+       dim_t             n_panel, \
+       dim_t             m_panel_max, \
+       dim_t             n_panel_max, \
+       ctype_p* restrict kappa, \
+       ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
+                            inc_t is_p, \
+       cntx_t*           cntx  \
+     ) \
+{ \
+	dim_t  panel_dim; \
+	dim_t  panel_len; \
+	inc_t  incc, ldc; \
+	inc_t        ldp; \
+	/* NOTE: kappa and is_p are part of the signature but are not referenced by this implementation. */ \
+\
+	/* Determine the dimensions and relative strides of the micro-panel
+	   based on its pack schema. */ \
+	if ( bli_is_col_packed( schema ) ) \
+	{ \
+		/* Prepare to pack to row-stored column panel. */ \
+		panel_dim = n_panel; \
+		panel_len = m_panel; \
+		incc      = cs_c; \
+		ldc       = rs_c; \
+		ldp       = rs_p; \
+	} \
+	else /* if ( bli_is_row_packed( schema ) ) */ \
+	{ \
+		/* Prepare to pack to column-stored row panel. */ \
+		panel_dim = m_panel; \
+		panel_len = n_panel; \
+		incc      = rs_c; \
+		ldc       = cs_c; \
+		ldp       = cs_p; \
+	} \
+\
+\
+	if ( bli_is_nat_packed( schema ) ) \
+	{ \
+		trans_t transc = ( trans_t )conjc; \
+\
+		/* NOTE: We ignore kappa for now, since it should be 1.0. */ \
+		PASTEMAC2(chc,chp,castm) \
+		( \
+		  transc, \
+		  panel_dim, \
+		  panel_len, \
+		  c, incc, ldc, \
+		  p,    1, ldp  \
+		); \
+\
+		/* The packed memory region was acquired/allocated with "aligned"
+		   dimensions (ie: dimensions that were possibly inflated up to a
+		   multiple). When these dimensions are inflated, it creates empty
+		   regions along the bottom and/or right edges of the matrix. If
+		   either region exists, we set them to zero. This allows the
+		   micro-kernel to remain simple since it does not need to support
+		   different register blockings for the edge cases. */ \
+		if ( m_panel != m_panel_max ) \
+		{ \
+			ctype_p* restrict zero   = PASTEMAC(chp,0); \
+			dim_t             i      = m_panel; \
+			dim_t             m_edge = m_panel_max - i; \
+			dim_t             n_edge = n_panel_max; \
+			ctype_p*          p_edge = p + (i  )*rs_p; \
+\
+			PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
+			( \
+			  BLIS_NO_CONJUGATE, \
+			  0, \
+			  BLIS_NONUNIT_DIAG, \
+			  BLIS_DENSE, \
+			  m_edge, \
+			  n_edge, \
+			  zero, \
+			  p_edge, rs_p, cs_p, \
+			  cntx, \
+			  NULL  \
+			); \
+		} \
+\
+		if ( n_panel != n_panel_max ) \
+		{ \
+			ctype_p* restrict zero   = PASTEMAC(chp,0); \
+			dim_t             j      = n_panel; \
+			dim_t             m_edge = m_panel_max; \
+			dim_t             n_edge = n_panel_max - j; \
+			ctype_p*          p_edge = p + (j  )*cs_p; \
+\
+			PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
+			( \
+			  BLIS_NO_CONJUGATE, \
+			  0, \
+			  BLIS_NONUNIT_DIAG, \
+			  BLIS_DENSE, \
+			  m_edge, \
+			  n_edge, \
+			  zero, \
+			  p_edge, rs_p, cs_p, \
+			  cntx, \
+			  NULL  \
+			); \
+		} \
+	} \
+	else /* if ( bli_is_1r_packed( schema ) ) */ \
+	{ \
+		/* NOTE: We ignore kappa for now, since it should be 1.0. */ \
+		PASTEMAC2(chc,chp,packm_cxk_1r_md) \
+		( \
+		  conjc, \
+		  panel_dim, \
+		  panel_len, \
+		  c, incc, ldc, \
+		  p,       ldp  \
+		); \
+		/* As in the natively-packed branch, handle any bottom/right edge regions, here by passing zero to set1ms_mxn. */ \
+		if ( m_panel != m_panel_max ) \
+		{ \
+			ctype_p* restrict zero   = PASTEMAC(chp,0); \
+			dim_t             offm   = m_panel; \
+			dim_t             offn   = 0; \
+			dim_t             m_edge = m_panel_max - m_panel; \
+			dim_t             n_edge = n_panel_max; \
+			/* NOTE(review): the void casts below presumably silence unused-variable warnings -- confirm against set1ms_mxn. */ \
+			( void ) zero; \
+			( void ) m_edge; ( void )offm; \
+			( void ) n_edge; ( void )offn; \
+\
+			PASTEMAC(chp,set1ms_mxn) \
+			( \
+			  schema, \
+			  offm, \
+			  offn, \
+			  m_edge, \
+			  n_edge, \
+			  zero, \
+			  p, rs_p, cs_p, ldp  \
+			); \
+		} \
+\
+		if ( n_panel != n_panel_max ) \
+		{ \
+			ctype_p* restrict zero   = PASTEMAC(chp,0); \
+			dim_t             offm   = 0; \
+			dim_t             offn   = n_panel; \
+			dim_t             m_edge = m_panel_max; \
+			dim_t             n_edge = n_panel_max - n_panel; \
+\
+			( void ) zero; \
+			( void ) m_edge; ( void )offm; \
+			( void ) n_edge; ( void )offn; \
+\
+			PASTEMAC(chp,set1ms_mxn) \
+			( \
+			  schema, \
+			  offm, \
+			  offn, \
+			  m_edge, \
+			  n_edge, \
+			  zero, \
+			  p, rs_p, cs_p, ldp  \
+			); \
+		} \
+	} \
+\
+\
+/*
+	if ( bli_is_col_packed( schema ) ) \
+	PASTEMAC(chp,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
+	                       p, rs_p, cs_p, "%4.1f", "" ); \
+	else if ( bli_is_row_packed( schema ) ) \
+	PASTEMAC(chp,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
+	                       p, rs_p, cs_p, "%4.1f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC2_BASIC0( packm_struc_cxk_md )
+INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md )
+
+
+// -----------------------------------------------------------------------------
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
+/* Copy m x n elements of a (read as interleaved real/imaginary pairs) into p, writing real and imaginary parts to separate runs ldp real elements apart; conjugate if conja says so. */ \
+void PASTEMAC2(cha,chp,opname) \
+     ( \
+       conj_t            conja, \
+       dim_t             m, \
+       dim_t             n, \
+       ctype_a* restrict a, inc_t inca, inc_t lda, \
+       ctype_p* restrict p,             inc_t ldp  \
+     ) \
+{ \
+	const inc_t                   inca2      = 2 * inca; \
+	const inc_t                   lda2       = 2 * lda; \
+	const inc_t                   ldp2       = 2 * ldp; \
+	/* a and p are addressed through real-typed pointers, hence the doubled strides: imag of a is at offset +1, imag of p is at offset +ldp. */ \
+	PASTEMAC(cha,ctyper)* restrict alpha1_r   = ( PASTEMAC(cha,ctyper)* )a; \
+	PASTEMAC(cha,ctyper)* restrict alpha1_i   = ( PASTEMAC(cha,ctyper)* )a + 1; \
+	PASTEMAC(chp,ctyper)* restrict pi1_r      = ( PASTEMAC(chp,ctyper)* )p; \
+	PASTEMAC(chp,ctyper)* restrict pi1_i      = ( PASTEMAC(chp,ctyper)* )p + ldp; \
+\
+	dim_t i; \
+\
+	if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( ; n != 0; --n ) \
+		{ \
+			for ( i = 0; i < m; ++i ) \
+			{ \
+				PASTEMAC2(cha,chp,copyjris)( *(alpha1_r + i*inca2), \
+				                             *(alpha1_i + i*inca2), \
+				                             *(pi1_r    + i*1), \
+				                             *(pi1_i    + i*1) ); \
+			} \
+\
+			alpha1_r += lda2; \
+			alpha1_i += lda2; \
+			pi1_r    += ldp2; \
+			pi1_i    += ldp2; \
+		} \
+	} \
+	else \
+	{ \
+		for ( ; n != 0; --n ) \
+		{ \
+			for ( i = 0; i < m; ++i ) \
+			{ \
+				PASTEMAC2(cha,chp,copyris)( *(alpha1_r + i*inca2), \
+				                            *(alpha1_i + i*inca2), \
+				                            *(pi1_r    + i*1), \
+				                            *(pi1_i    + i*1) ); \
+			} \
+\
+			alpha1_r += lda2; \
+			alpha1_i += lda2; \
+			pi1_r    += ldp2; \
+			pi1_i    += ldp2; \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC0( packm_cxk_1r_md )
+INSERT_GENTFUNC2_MIXDP0( packm_cxk_1r_md )
+
+#endif
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h
new file mode 100644
index 000000000..a0c56401e
--- /dev/null
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
+/* Prototype generator for the mixed-datatype micro-panel packing function (c has type ctype_c, p and kappa have type ctype_p). */ \
+void PASTEMAC2(chc,chp,varname) \
+     ( \
+       conj_t            conjc, \
+       pack_t            schema, \
+       dim_t             m_panel, \
+       dim_t             n_panel, \
+       dim_t             m_panel_max, \
+       dim_t             n_panel_max, \
+       ctype_p* restrict kappa, \
+       ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
+                            inc_t is_p, \
+       cntx_t*           cntx  \
+     );
+
+INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
+INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )
+
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
+/* Prototype generator for the mixed-datatype 1r copy helper (a has type ctype_a, p has type ctype_p). */ \
+void PASTEMAC2(cha,chp,opname) \
+     ( \
+       conj_t            conja, \
+       dim_t             m, \
+       dim_t             n, \
+       ctype_a* restrict a, inc_t inca, inc_t lda, \
+       ctype_p* restrict p,             inc_t ldp  \
+     );
+
+INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
+INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
+
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index d1bd9dec2..52895f121 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -295,13 +295,33 @@ void bli_gemm_basic_check
 	e_val = bli_check_level3_dims( a, b, c );
 	bli_check_error_code( e_val );
 
+#ifdef BLIS_ENABLE_GEMM_MD
+	// Skip checking for consistent datatypes between A, B, and C since
+	// that is totally valid for mixed-datatype gemm.
+
+	// When mixing datatypes, make sure that alpha does not have a non-zero
+	// imaginary component.
+	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
+	     bli_obj_dt( c ) != bli_obj_dt( b ) ||
+	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
+	if ( !bli_obj_imag_is_zero( alpha ) )
+	{
+		bli_print_msg( "Mixed-datatype gemm does not yet support alpha with a non-zero imaginary component. Please contact BLIS developers for further support.", __FILE__, __LINE__ );
+		bli_abort();
+	}
+
+#else // BLIS_DISABLE_GEMM_MD
+
 	// Check for consistent datatypes.
+	// NOTE: We only perform these tests when mixed datatype support is
+	// disabled.
 
 	e_val = bli_check_consistent_object_datatypes( c, a );
 	bli_check_error_code( e_val );
 
 	e_val = bli_check_consistent_object_datatypes( c, b );
 	bli_check_error_code( e_val );
+#endif
 }
 
 void bli_hemm_basic_check
diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c
index 7cfcd0f94..701428a59 100644
--- a/frame/3/bli_l3_oapi.c
+++ b/frame/3/bli_l3_oapi.c
@@ -58,11 +58,13 @@ void PASTEMAC(opname,EX_SUF) \
 	BLIS_OAPI_EX_DECLS \
 \
 	/* Only proceed with an induced method if all operands have the same
-	   (complex) datatype. If any datatypes differ, skip the induced method
-	   chooser function and proceed directly with native execution, which is
+	   (complex) datatype, and if that datatype matches the execution
+	   datatype. If any datatypes differ, skip the induced method chooser
+	   function and proceed directly with native execution, which is
 	   where mixed datatype support will be implemented (if at all). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_dt( b ) == bli_obj_dt( c ) && \
+	if ( bli_obj_dt( c ) == bli_obj_dt( a ) && \
+	     bli_obj_dt( c ) == bli_obj_dt( b ) && \
+	     bli_obj_dt( c ) == bli_obj_exec_dt( c ) && \
 	     bli_obj_is_complex( c ) ) \
 	{ \
 		/* Invoke the operation's "ind" function--its induced method front-end.
diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h
index 5349e9750..987ba93bd 100644
--- a/frame/3/gemm/bli_gemm.h
+++ b/frame/3/gemm/bli_gemm.h
@@ -38,3 +38,7 @@
 
 #include "bli_gemm_var.h"
 
+// Mixed datatype support.
+#ifdef BLIS_ENABLE_GEMM_MD
+#include "bli_gemm_md.h"
+#endif
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 8518d6102..62d1ae502 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -54,8 +54,13 @@ void bli_gemm_front
 	obj_t   c_local;
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
-	gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
-	if ( status == BLIS_SUCCESS ) return;
+	// Only handle small problems separately for homogeneous datatypes.
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
+	     bli_obj_dt( a ) == bli_obj_dt( c ) )
+	{
+		gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
+		if ( status == BLIS_SUCCESS ) return;
+	}
 #endif
 
 	// Check parameters.
@@ -74,38 +79,33 @@ void bli_gemm_front
 	bli_obj_alias_to( b, &b_local );
 	bli_obj_alias_to( c, &c_local );
 
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_swap( &a_local, &b_local );
+#ifdef BLIS_ENABLE_GEMM_MD
+	cntx_t cntx_local;
 
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &c_local );
+	// If any of the storage datatypes differ, or if the computation precision
+	// differs from the storage precision of C, utilize the mixed datatype
+	// code path.
+	// NOTE: If we ever want to support the caller setting the computation
+	// domain explicitly, we will need to check the computation dt against the
+	// storage dt of C (instead of the computation precision against the
+	// storage precision of C).
+	if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
+	     bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
+	     bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
+	{
+		// Handle mixed datatype cases in bli_gemm_md(), which may modify
+		// the objects or the context. (If the context is modified, cntx
+		// is adjusted to point to cntx_local.)
+		bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
 	}
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_GEMM,
-	  BLIS_LEFT, // ignored for gemm/hemm/symm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
+	else // homogeneous datatypes
+#endif
 	{
-		// A sort of hack for communicating the desired pach schemas for A and B
-		// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
+		// A sort of hack for communicating the desired pack schemas for A and
+		// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 		// bli_l3_cntl_create_if()). This allows us to access the schemas from
-		// the control tree, which hopefully reduces some confusion, particularly
-		// in bli_packm_init().
+		// the control tree, which hopefully reduces some confusion,
+		// particularly in bli_packm_init().
 		if ( bli_cntx_method( cntx ) == BLIS_NAT )
 		{
 			bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
@@ -121,6 +121,129 @@ void bli_gemm_front
 		}
 	}
 
+#ifdef BLIS_ENABLE_GEMM_MD
+	// Don't perform the following optimization for ccr or crc cases, as
+	// those cases are sensitive to the ukernel storage preference (ie:
+	// transposing the operation would break them).
+	if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
+	     !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
+#endif
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+
+		// We must also swap the pack schemas, which were set by bli_gemm_md()
+		// or the inlined code above.
+		bli_obj_swap_pack_schemas( &a_local, &b_local );
+	}
+
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop, and then make any
+	// additional modifications necessary for the current operation.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm/hemm/symm
+	  bli_obj_length( &c_local ),
+	  bli_obj_width( &c_local ),
+	  bli_obj_width( &a_local ),
+	  rntm
+	);
+
+	obj_t* cp    = &c_local;
+	obj_t* betap = beta;
+
+#ifdef BLIS_ENABLE_GEMM_MD
+#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+	// If any of the following conditions are met, create a temporary matrix
+	// conformal to C into which we will accumulate the matrix product:
+	// - the storage precision of C differs from the computation precision;
+	// - the domains are mixed as crr;
+	// - the storage format of C does not match the preferred orientation
+	//   of the ccr or crc cases.
+	// Then, after the computation is complete, this matrix will be copied
+	// or accumulated back to C.
+	const bool_t is_ccr_mismatch =
+	             ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
+                   !bli_obj_is_col_stored( &c_local ) );
+	const bool_t is_crc_mismatch =
+	             ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
+                   !bli_obj_is_row_stored( &c_local ) );
+
+	obj_t  ct;
+	bool_t use_ct = FALSE;
+
+	// FGVZ: Consider adding another guard here that only creates and uses a
+	// temporary matrix for accumulation if k < c * kc, where c is some small
+	// constant like 2. And don't forget to use the same conditional for the
+	// castm() and free() at the end.
+	if (
+	     bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
+	     bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
+	     is_ccr_mismatch ||
+	     is_crc_mismatch
+	   )
+	{
+		use_ct = TRUE;
+	}
+
+	// If we need a temporary matrix conformal to C for whatever reason,
+	// we create it and prepare to use it now.
+	if ( use_ct )
+	{
+		const dim_t m     = bli_obj_length( &c_local );
+		const dim_t n     = bli_obj_width( &c_local );
+		      inc_t rs    = bli_obj_row_stride( &c_local );
+		      inc_t cs    = bli_obj_col_stride( &c_local );
+
+		      num_t dt_ct = bli_obj_domain( &c_local ) |
+		                    bli_obj_comp_prec( &c_local );
+
+		// When performing the crr case, accumulate to a contiguously-stored
+		// real matrix so we do not have to repeatedly update C with general
+		// stride.
+		if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
+			dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
+
+		// When performing the mismatched ccr or crc cases, now is the time
+		// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
+		// microkernel can output directly to C (instead of using a temporary
+		// microtile).
+		if      ( is_ccr_mismatch ) { rs = 1; cs = m; }
+		else if ( is_crc_mismatch ) { rs = n; cs = 1; }
+
+		bli_obj_create( dt_ct, m, n, rs, cs, &ct );
+
+		const num_t dt_exec = bli_obj_exec_dt( &c_local );
+		const num_t dt_comp = bli_obj_comp_dt( &c_local );
+
+		bli_obj_set_target_dt( dt_ct, &ct );
+		bli_obj_set_exec_dt( dt_exec, &ct );
+		bli_obj_set_comp_dt( dt_comp, &ct );
+
+		// A naive approach would cast C to the computation datatype,
+		// compute with beta, and then cast the result back to the
+		// user-provided output matrix. However, we employ a different
+		// approach that halves the number of memops on C (or its
+		// typecast temporary) by writing the A*B product directly to
+		// temporary storage, and then using xpbym to scale the
+		// output matrix by beta and accumulate/cast the A*B product.
+		//bli_castm( &c_local, &ct );
+		betap = &BLIS_ZERO;
+
+		cp = &ct;
+	}
+#endif
+#endif
+
 	// Invoke the internal back-end via the thread handler.
 	bli_l3_thread_decorator
 	(
@@ -129,11 +252,112 @@ void bli_gemm_front
 	  alpha,
 	  &a_local,
 	  &b_local,
-	  beta,
-	  &c_local,
+	  betap,
+	  cp,
 	  cntx,
 	  rntm,
 	  cntl
 	);
+
+#ifdef BLIS_ENABLE_GEMM_MD
+#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+	// If we created a temporary matrix conformal to C for whatever reason,
+	// we copy/accumulate the result back to C and then release the object.
+	if ( use_ct )
+    {
+		//bli_castnzm( &ct, &c_local );
+		bli_xpbym( &ct, beta, &c_local );
+
+		bli_obj_free( &ct );
+	}
+#endif
+#endif
 }
 
+// -----------------------------------------------------------------------------
+// NOTE(review): the rest of this file is compiled out (#if 0, and a nested #if 0 in the #else branch); it is file-scope statement code and would not compile if enabled.
+#if 0
+	if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
+	     bli_obj_dt( a ) != bli_obj_dt( c ) ||
+	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
+	{
+		const bool_t a_is_real = bli_obj_is_real( a );
+		const bool_t a_is_comp = bli_obj_is_complex( a );
+		const bool_t b_is_real = bli_obj_is_real( b );
+		const bool_t b_is_comp = bli_obj_is_complex( b );
+		const bool_t c_is_real = bli_obj_is_real( c );
+		const bool_t c_is_comp = bli_obj_is_complex( c );
+
+		const bool_t a_is_single = bli_obj_is_single_prec( a );
+		const bool_t a_is_double = bli_obj_is_double_prec( a );
+		const bool_t b_is_single = bli_obj_is_single_prec( b );
+		const bool_t b_is_double = bli_obj_is_double_prec( b );
+		const bool_t c_is_single = bli_obj_is_single_prec( c );
+		const bool_t c_is_double = bli_obj_is_double_prec( c );
+
+		const bool_t comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
+		const bool_t comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
+
+		const bool_t mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
+		                           bli_obj_domain( c ) != bli_obj_domain( b );
+
+		( void )a_is_real; ( void )a_is_comp;
+		( void )b_is_real; ( void )b_is_comp;
+		( void )c_is_real; ( void )c_is_comp;
+		( void )a_is_single; ( void )a_is_double;
+		( void )b_is_single; ( void )b_is_double;
+		( void )c_is_single; ( void )c_is_double;
+		( void )comp_single; ( void )comp_double;
+
+		if (
+		     //( c_is_comp && a_is_comp && b_is_real ) ||
+		     //( c_is_comp && a_is_real && b_is_comp ) ||
+		     //( c_is_real && a_is_comp && b_is_comp ) ||
+		     //( c_is_comp && a_is_real && b_is_real ) ||
+		     //( c_is_real && a_is_comp && b_is_real ) ||
+		     //( c_is_real && a_is_real && b_is_comp ) ||
+		     //FALSE
+		     TRUE
+		   )
+		{
+			if (
+			     ( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
+			     ( c_is_single && a_is_single && b_is_single && comp_single ) ||
+			     ( c_is_single && a_is_single && b_is_single && comp_double ) ||
+			     ( c_is_single && a_is_single && b_is_double                ) ||
+			     ( c_is_single && a_is_double && b_is_single                ) ||
+			     ( c_is_double && a_is_single && b_is_single                ) ||
+			     ( c_is_single && a_is_double && b_is_double                ) ||
+			     ( c_is_double && a_is_single && b_is_double                ) ||
+			     ( c_is_double && a_is_double && b_is_single                ) ||
+			     ( c_is_double && a_is_double && b_is_double && comp_single ) ||
+			     ( c_is_double && a_is_double && b_is_double && comp_double ) ||
+			     ( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
+			     FALSE
+			   )
+				bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
+			else
+				bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
+		}
+		else
+			bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
+		return;
+	}
+#else
+#if 0
+	// If any of the storage datatypes differ, or if the execution precision
+	// differs from the storage precision of C, utilize the mixed datatype
+	// code path.
+	// NOTE: We could check the exec dt against the storage dt of C, but for
+	// now we don't support the caller setting the execution domain
+	// explicitly.
+	if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
+	     bli_obj_dt( a ) != bli_obj_dt( c ) ||
+	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
+	{
+		bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
+		return;
+	}
+#endif
+#endif
+
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 1967c6ce4..1aa032ad9 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -69,6 +69,22 @@ void bli_gemm_ker_var2
        thrinfo_t* thread
      )
 {
+#ifdef BLIS_ENABLE_GEMM_MD
+	// By now, A and B have been packed and cast to the execution precision.
+	// In most cases, such as when storage precision of C differs from the
+	// execution precision, we utilize the mixed datatype code path. However,
+	// a few cases still fall within this kernel, such as mixed domain with
+	// equal precision (ccr, crc, rcc), hence those expressions being disabled
+	// in the conditional below.
+	if ( //( bli_obj_domain( c ) != bli_obj_domain( a ) ) ||
+	     //( bli_obj_domain( c ) != bli_obj_domain( b ) ) ||
+	     ( bli_obj_dt( c ) != bli_obj_exec_dt( c ) ) )
+	{
+		bli_gemm_ker_var2_md( a, b, c, cntx, rntm, cntl, thread );
+		return;
+	}
+#endif
+
 	num_t     dt_exec   = bli_obj_exec_dt( c );
 
 	pack_t    schema_a  = bli_obj_pack_schema( a );
@@ -112,12 +128,12 @@ void bli_gemm_ker_var2
 	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
 	buf_beta  = bli_obj_internal_scalar_buffer( c );
 
-    // If 1m is being employed on a column- or row-stored matrix with a
-    // real-valued beta, we can use the real domain macro-kernel, which
+	// If 1m is being employed on a column- or row-stored matrix with a
+	// real-valued beta, we can use the real domain macro-kernel, which
 	// eliminates a little overhead associated with the 1m virtual
 	// micro-kernel.
 #if 1
-	if ( bli_is_1m_packed( schema_a ) )
+	if ( bli_cntx_method( cntx ) == BLIS_1M )
 	{
 		bli_l3_ind_recast_1m_params
 		(
@@ -132,6 +148,22 @@ void bli_gemm_ker_var2
 	}
 #endif
 
+#ifdef BLIS_ENABLE_GEMM_MD
+	// Tweak parameters in select mixed domain cases.
+	bli_gemm_md_ker_var2_recast
+	(
+	  &dt_exec,
+	  bli_obj_dt( a ),
+	  bli_obj_dt( b ),
+	  bli_obj_dt( c ),
+	  &m, &n, &k,
+	  &pd_a, &ps_a,
+	  &pd_b, &ps_b,
+	  c,
+	  &rs_c, &cs_c
+	);
+#endif
+
 	// Index into the type combination array to extract the correct
 	// function pointer.
 	f = ftypes[dt_exec];
@@ -267,6 +299,9 @@ void PASTEMAC(ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
 	dim_t jr_num_threads = bli_thread_n_way( thread ); \
diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c
new file mode 100644
index 000000000..e414722b9
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_ker_var2_md.c
@@ -0,0 +1,405 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_GEMM_MD
+
+#define FUNCPTR_T gemm_fp
+
+typedef void (*FUNCPTR_T)
+     (
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+static FUNCPTR_T GENARRAY2_ALL(ftypes,gemm_ker_var2_md);
+
+
+void bli_gemm_ker_var2_md
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+	num_t     dt_c      = bli_obj_dt( c );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	FUNCPTR_T f;
+
+	// Detach and multiply the scalars attached to A and B.
+	// NOTE: We know that the internal scalars of A and B are already of the
+	// target datatypes because the necessary typecasting would have already
+	// taken place during bli_packm_init().
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	// NOTE: We know that scalar_b is of type dt_exec due to the above code
+	// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
+	// and we know that the internal scalar in C is already of the type dt_c
+	// due to the casting in the implementation of bli_obj_scalar_attach().
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Tweak parameters in select mixed domain cases.
+	bli_gemm_md_ker_var2_recast
+	(
+	  &dt_exec,
+	  bli_obj_dt( a ),
+	  bli_obj_dt( b ),
+	  bli_obj_dt( c ),
+	  &m, &n, &k,
+	  &pd_a, &ps_a,
+	  &pd_b, &ps_b,
+	  c,
+	  &rs_c, &cs_c
+	);
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_c][dt_exec];
+
+	// Invoke the function.
+	f( schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_c, ctype_e, chc, che, varname ) \
+\
+void PASTEMAC2(chc,che,varname) \
+     ( \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dte        = PASTEMAC(che,type); \
+	/*const num_t     dtc        = PASTEMAC(chc,type);*/ \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(che,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dte, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary microtile buffer of type ctype_e. Every micro-kernel
+	   result is written here first and then accumulated into C. The
+	   strides follow the output preference of the micro-kernel: ct is
+	   column-stored if it prefers columns, and row-stored otherwise. */ \
+	ctype_e         ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype_e ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dte, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype_e* restrict zero       = PASTEMAC(che,0); \
+	ctype_e* restrict a_cast     = a; \
+	ctype_e* restrict b_cast     = b; \
+	ctype_c* restrict c_cast     = c; \
+	ctype_e* restrict alpha_cast = alpha; \
+	ctype_c* restrict beta_cast  = beta; \
+	ctype_e* restrict b1; \
+	ctype_c* restrict c1; \
+\
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           i, j; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(che,set0s_mxn)( MR, NR, \
+	                         ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
+	dim_t jr_num_threads = bli_thread_n_way( thread ); \
+	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
+	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
+	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
+	{ \
+		ctype_e* restrict a1; \
+		ctype_c* restrict c11; \
+		ctype_e* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
+		{ \
+			ctype_e* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
+			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
+				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* Always save the micropanel product to the local microtile and
+			   then accumulate it into C via the xpbys_mxn macro. */ \
+			/*if ( 1 )*/ \
+			{ \
+				/*bli_auxinfo_set_dt_on_output( dte, &aux );*/ \
+\
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale the microtile of C and add the result from above. */ \
+				PASTEMAC3(che,chc,chc,xpbys_mxn) \
+				( \
+				  m_cur, n_cur, \
+				  ct,  rs_ct, cs_ct, \
+				  beta_cast, \
+				  c11, rs_c,  cs_c \
+				); \
+			} \
+/*
+			else if ( m_cur == MR && n_cur == NR ) \
+			{ \
+				bli_auxinfo_set_dt_on_output( dtc, &aux ); \
+\
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  ( ctype_e* )beta_cast, \
+				  ( ctype_e* )c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+			else \
+			{ \
+				bli_auxinfo_set_dt_on_output( dte, &aux ); \
+\
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				PASTEMAC3(che,chc,chc,xpbys_mxn) \
+				( \
+				  m_cur, n_cur, \
+				  ct,  rs_ct, cs_ct, \
+				  beta_cast, \
+				  c11, rs_c,  cs_c \
+				); \
+			} \
+*/ \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+INSERT_GENTFUNC2_BASIC0( gemm_ker_var2_md )
+INSERT_GENTFUNC2_MIXDP0( gemm_ker_var2_md )
+
+#endif
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
new file mode 100644
index 000000000..9ccb49225
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -0,0 +1,901 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_GEMM_MD
+
+void bli_gemm_md
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	mddm_t doms;
+
+	const bool_t a_is_real = bli_obj_is_real( a );
+	const bool_t a_is_comp = bli_obj_is_complex( a );
+	const bool_t b_is_real = bli_obj_is_real( b );
+	const bool_t b_is_comp = bli_obj_is_complex( b );
+	const bool_t c_is_real = bli_obj_is_real( c );
+	const bool_t c_is_comp = bli_obj_is_complex( c );
+
+	if      ( c_is_real && a_is_real && b_is_real )
+	{
+		// C_real += A_real * B_real
+		doms = bli_gemm_md_rrr( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_comp && a_is_comp && b_is_comp )
+	{
+		// C_complex += A_complex * B_complex
+		doms = bli_gemm_md_ccc( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_comp && a_is_comp && b_is_real )
+	{
+		// C_complex += A_complex * B_real
+		doms = bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_comp && a_is_real && b_is_comp )
+	{
+		// C_complex += A_real * B_complex
+		doms = bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_real && a_is_comp && b_is_comp )
+	{
+		// C_real += A_complex * B_complex
+		doms = bli_gemm_md_rcc( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_comp && a_is_real && b_is_real )
+	{
+		// C_complex += A_real * B_real
+		doms = bli_gemm_md_crr( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_real && a_is_comp && b_is_real )
+	{
+		// C_real += A_complex * B_real
+		doms = bli_gemm_md_rcr( a, b, beta, c, cntx_local, cntx );
+	}
+	else if ( c_is_real && a_is_real && b_is_comp )
+	{
+		// C_real += A_real * B_complex
+		doms = bli_gemm_md_rrc( a, b, beta, c, cntx_local, cntx );
+	}
+	else
+	{
+		doms.comp = BLIS_REAL;
+		doms.exec = BLIS_REAL;
+
+		// This should never execute.
+		bli_abort();
+	}
+
+	// Extract the computation and execution domains from the struct
+	// returned above.
+	dom_t dom_comp = doms.comp;
+	dom_t dom_exec = doms.exec;
+
+	// Inspect the computation precision of C. (The user may have set
+	// this explicitly to request the precision in which the computation
+	// should take place.)
+	prec_t prec_comp = bli_obj_comp_prec( c );
+
+	// The computation precision tells us the target precision of A and B.
+	// NOTE: We don't set the target domain here. The target domain would
+	// either be unchanged, or would have been changed in one of the eight
+	// domain cases above.
+	bli_obj_set_target_prec( prec_comp, a );
+	bli_obj_set_target_prec( prec_comp, b );
+
+	// Combine the execution domain with the computation precision to form
+	// the execution datatype. (The computation precision and execution
+	// precision are always equal.)
+	num_t dt_exec = dom_exec | prec_comp;
+
+	// Set the execution datatypes of A, B, and C.
+	bli_obj_set_exec_dt( dt_exec, a );
+	bli_obj_set_exec_dt( dt_exec, b );
+	bli_obj_set_exec_dt( dt_exec, c );
+
+	// Combine the computation precision and computation domain to form the
+	// computation datatype.
+	num_t dt_comp = dom_comp | prec_comp;
+
+	// Set the computation datatypes of A, B, and C.
+	bli_obj_set_comp_dt( dt_comp, a );
+	bli_obj_set_comp_dt( dt_comp, b );
+	bli_obj_set_comp_dt( dt_comp, c );
+
+
+
+#if 0
+	if ( bli_obj_is_single_prec( c ) ) printf( "%% --> s += " );
+	else                               printf( "%% --> d += " );
+	if ( bli_obj_is_single_prec( a ) ) printf( "s " );
+	else                               printf( "d " );
+	if ( bli_obj_is_single_prec( b ) ) printf( "s\n" );
+	else                               printf( "d\n" );
+
+	//if ( bli_obj_is_scomplex( a ) &&
+	//     bli_obj_is_dcomplex( b ) &&
+	//     bli_obj_is_float( c ) )
+	{
+		printf( "bli_gemm_md(): stor precs after:   %d %d %d\n", bli_obj_prec( a ),
+		                                      bli_obj_prec( b ), bli_obj_prec( c ) );
+		printf( "bli_gemm_md(): targ precs after:   %d %d %d\n", bli_obj_target_prec( a ),
+		                               bli_obj_target_prec( b ), bli_obj_target_prec( c ) );
+		printf( "bli_gemm_md(): exec precs after:   %d %d %d\n", bli_obj_exec_prec( a ),
+		                                 bli_obj_exec_prec( b ), bli_obj_exec_prec( c ) );
+		printf( "bli_gemm_md(): stor domain after:  %d %d %d\n", bli_obj_domain( a ),
+		                                    bli_obj_domain( b ), bli_obj_domain( c ) );
+		printf( "bli_gemm_md(): targ domain after:  %d %d %d\n", bli_obj_target_domain( a ),
+		                             bli_obj_target_domain( b ), bli_obj_target_domain( c ) );
+		printf( "bli_gemm_md(): exec domain after:  %d %d %d\n", bli_obj_exec_domain( a ),
+		                               bli_obj_exec_domain( b ), bli_obj_exec_domain( c ) );
+	}
+#endif
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab
+mddm_t bli_gemm_md_ccr
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	mddm_t doms;
+
+	// We assume that the requested computation domain is complex.
+	//dom_t dom_comp_in = bli_obj_comp_domain( c );
+	//dom_t dom_comp_in = BLIS_COMPLEX;
+
+	// For ccr, the computation (ukernel) will be real, but the execution
+	// will appear complex to other parts of the implementation.
+	doms.comp = BLIS_REAL;
+	doms.exec = BLIS_COMPLEX;
+
+	// Here we construct the computation datatype, which for the ccr case
+	// is equal to the real projection of the execution datatype, and use
+	// that computation datatype to query the corresponding ukernel output
+	// preference.
+	const num_t  dt = BLIS_REAL | bli_obj_comp_prec( c );
+	const bool_t row_pref
+	      = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );
+
+	// We can only perform this case of mixed-domain gemm, C += A*B where
+	// B is real, if the microkernel prefers column output. If it prefers
+	// row output, we must induce a transposition and perform C += A*B
+	// where A (formerly B) is real.
+	if ( row_pref )
+	{
+		bli_obj_swap( a, b );
+
+		bli_obj_induce_trans( a );
+		bli_obj_induce_trans( b );
+		bli_obj_induce_trans( c );
+
+		return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
+	}
+
+	// Create a local copy of the context and then prepare to use this
+	// context instead of the one passed in.
+	*cntx_local = **cntx;
+	*cntx = cntx_local;
+
+	// Copy the real domain blocksizes into the slots of their complex
+	// counterparts.
+	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
+	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
+	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
+	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
+	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+
+	// Halve both the real and complex MR's (which are both real MR's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr );
+
+	// Halve both the real and complex MC's (which are both real MC's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
+
+	// Set the pack schemas of objects A and B for normal execution.
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
+	func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
+
+	// Rather than check which complex datatype is the execution datatype,
+	// we set the mixed-domain virtual microkernel for both types.
+	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
+	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );
+
+	// Return the computation and execution domains.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab
+mddm_t bli_gemm_md_crc
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	mddm_t doms;
+
+	// We assume that the requested computation domain is complex.
+	//dom_t dom_comp_in = bli_obj_comp_domain( c );
+	//dom_t dom_comp_in = BLIS_COMPLEX;
+
+	// For crc, the computation (ukernel) will be real, but the execution
+	// will appear complex to other parts of the implementation.
+	doms.comp = BLIS_REAL;
+	doms.exec = BLIS_COMPLEX;
+
+	// Here we construct the computation datatype, which for the crc case
+	// is equal to the real projection of the execution datatype, and use
+	// that computation datatype to query the corresponding ukernel output
+	// preference.
+	const num_t  dt = BLIS_REAL | bli_obj_comp_prec( c );
+	const bool_t col_pref
+	      = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );
+
+	// We can only perform this case of mixed-domain gemm, C += A*B where
+	// A is real, if the microkernel prefers row output. If it prefers
+	// column output, we must induce a transposition and perform C += A*B
+	// where B (formerly A) is real.
+	if ( col_pref )
+	{
+		bli_obj_swap( a, b );
+
+		bli_obj_induce_trans( a );
+		bli_obj_induce_trans( b );
+		bli_obj_induce_trans( c );
+
+		return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
+	}
+
+	// Create a local copy of the context and then prepare to use this
+	// context instead of the one passed in.
+	*cntx_local = **cntx;
+	*cntx = cntx_local;
+
+	// Copy the real domain blocksizes into the slots of their complex
+	// counterparts.
+	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
+	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
+	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
+	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
+	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+
+	// Halve both the real and complex NR's (which are both real NR's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr );
+
+	// Halve both the real and complex NC's (which are both real NC's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );
+
+	// Set the pack schemas of objects A and B for normal execution.
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
+	func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
+
+	// Rather than check which complex datatype is the execution datatype,
+	// we set the mixed-domain virtual microkernel for both types.
+	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
+	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );
+
+	// Return the computation and execution domains.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab
+mddm_t bli_gemm_md_rcc
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	mddm_t doms;
+
+	// We assume that the requested computation domain is complex.
+	//dom_t dom_comp_in = bli_obj_comp_domain( c );
+	//dom_t dom_comp_in = BLIS_COMPLEX;
+
+	// For rcc, the computation (ukernel) will be real, and since the output
+	// matrix C is also real, so must be the execution domain.
+	doms.comp = BLIS_REAL;
+	doms.exec = BLIS_REAL;
+
+	// Create a local copy of the context and then prepare to use this
+	// context instead of the one passed in.
+	*cntx_local = **cntx;
+	*cntx = cntx_local;
+
+	// Copy the real domain blocksizes into the slots of their complex
+	// counterparts.
+	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
+	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
+	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
+	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
+	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+
+	// Halve both the real and complex KC's (which are both real KC's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc );
+
+	// Use the 1r pack schema for both A and B with the conjugation
+	// of A or B toggled (to produce ar * br - ai * bi).
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );
+
+	bli_obj_toggle_conj( b );
+
+	// We also need to copy over the packm kernels from the 1m
+	// context. We query the address of that context here.
+	const num_t dt_comp = bli_obj_dt( a );
+	cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_comp );
+
+	func_t* cntx_funcs    = bli_cntx_packm_kers_buf( *cntx );
+	func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m );
+
+	for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i )
+	{
+		cntx_funcs[ i ] = cntx_1m_funcs[ i ];
+	}
+
+	// Return the computation and execution domains.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab
+mddm_t bli_gemm_md_crr
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	mddm_t doms;
+#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+	obj_t  c_real;
+#endif
+
+	// We assume that the requested computation domain is real.
+	//dom_t dom_comp_in = bli_obj_comp_domain( c );
+	//dom_t dom_comp_in = BLIS_REAL;
+
+	// For crr, the computation (ukernel) will be real, and since we will
+	// be updating only the real part of the output matrix C, the execution
+	// domain is also real.
+	doms.comp = BLIS_REAL;
+	doms.exec = BLIS_REAL;
+
+	// Since the A*B product is real, we can update only the real part of
+	// C. Thus, we convert the obj_t for the complex matrix to one that
+	// represents only the real part. HOWEVER, there are two situations in
+	// which we forgo this trick:
+	// - If extra memory optimizations are enabled, we should leave C alone
+	//   since we'll be computing A*B to a temporary matrix and accumulating
+	//   that result back to C, and in order for that to work, we need to
+	//   allow that code to continue accessing C as a complex matrix.
+	// - Even if extra memory optimizations are disabled, logically projecting
+	//   C as a real matrix can still cause problems if beta is non-unit. In
+	//   that situation, the implementation won't get a chance to scale the
+	//   imaginary components of C by beta, and thus it would compute the
+	//   wrong answer. Thus, if beta is non-unit, we must leave C alone.
+#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+	if ( bli_obj_equals( beta, &BLIS_ONE ) )
+	{
+		bli_obj_real_part( c, &c_real );
+
+		// Overwrite the complex obj_t with its real-only alias.
+		*c = c_real;
+	}
+#endif
+
+	// Set the pack schemas of objects A and B for normal execution.
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// Return the computation and execution domains.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab  (domains of C, A, B: c = complex, r = real)
+mddm_t bli_gemm_md_rcr
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	// Case rcr: C and B are real while A is complex. A is replaced by an
+	// alias of its real part, and A and B are tagged with their pack
+	// schemas. The beta, c, cntx_local, and cntx arguments are not
+	// referenced in this case. Note that the objects behind a and b are
+	// modified in place.
+
+	// The computation (ukernel) is real, and because the output matrix C
+	// is also real, the execution domain must be real as well. (The
+	// requested computation domain is assumed to be real.)
+	mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };
+
+	// Build an obj_t that represents only the real part of the complex
+	// matrix A, then overwrite the complex obj_t with that alias.
+	obj_t a_re;
+
+	bli_obj_real_part( a, &a_re );
+	*a = a_re;
+
+	// Tag A with the row-panel pack schema and B with the column-panel
+	// pack schema (normal execution).
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// Hand the computation and execution domains back to the caller.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab  (domains of C, A, B: c = complex, r = real)
+mddm_t bli_gemm_md_rrc
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	// Case rrc: C and A are real while B is complex. B is replaced by an
+	// alias of its real part, and A and B are tagged with their pack
+	// schemas. The beta, c, cntx_local, and cntx arguments are not
+	// referenced in this case. Note that the objects behind a and b are
+	// modified in place.
+
+	// The computation (ukernel) is real, and because the output matrix C
+	// is also real, the execution domain must be real as well. (The
+	// requested computation domain is assumed to be real.)
+	mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };
+
+	// Build an obj_t that represents only the real part of the complex
+	// matrix B, then overwrite the complex obj_t with that alias.
+	obj_t b_re;
+
+	bli_obj_real_part( b, &b_re );
+	*b = b_re;
+
+	// Tag A with the row-panel pack schema and B with the column-panel
+	// pack schema (normal execution).
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// Hand the computation and execution domains back to the caller.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab  (domains of C, A, B: c = complex, r = real)
+mddm_t bli_gemm_md_rrr
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	// Case rrr: C, A, and B are all real. No operand is re-aliased here;
+	// only the domains are selected and A and B are tagged with their
+	// pack schemas. The beta, c, cntx_local, and cntx arguments are not
+	// referenced in this case. Note that the objects behind a and b are
+	// modified in place.
+
+	// The computation (ukernel) and execution domains are both real.
+	// (The requested computation domain is assumed to be real.)
+	mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };
+
+	// Tag A with the row-panel pack schema and B with the column-panel
+	// pack schema (normal execution).
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// Hand the computation and execution domains back to the caller.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+//                 cab  (domains of C, A, B: c = complex, r = real)
+mddm_t bli_gemm_md_ccc
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     )
+{
+	// Case ccc: C, A, and B are all complex. No operand is re-aliased
+	// here; only the domains are selected and A and B are tagged with
+	// their pack schemas. The beta, c, cntx_local, and cntx arguments are
+	// not referenced in this case. Note that the objects behind a and b
+	// are modified in place.
+
+	// The computation (ukernel) and execution domains are both complex.
+	// (The requested computation domain is assumed to be complex.)
+	mddm_t doms = { .comp = BLIS_COMPLEX, .exec = BLIS_COMPLEX };
+
+	// Tag A with the row-panel pack schema and B with the column-panel
+	// pack schema (normal execution).
+	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a );
+	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b );
+
+	// Hand the computation and execution domains back to the caller.
+	return doms;
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_gemm_md_front
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     )
+{
+	bli_init_once();
+
+	obj_t   a_local;
+	obj_t   b_local;
+	obj_t   c_local;
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+	    bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// If alpha is zero, scale by beta and return.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( a, &a_local );
+	bli_obj_alias_to( b, &b_local );
+	bli_obj_alias_to( c, &c_local );
+
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+	}
+
+	cntx_t cntx_local;
+
+	// Handle mixed domain cases in bli_gemm_md(), which may modify
+	// the objects or the context. (If the context is modified, cntx
+	// is adjusted to point to cntx_local.)
+	bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
+
+	// Set the ways of parallelism for this operation in the rntm_t object.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm/hemm/symm
+	  bli_obj_length( &c_local ),
+	  bli_obj_width( &c_local ),
+	  bli_obj_width( &a_local ),
+	  rntm
+	);
+
+	// Invoke the internal back-end via the thread handler.
+	bli_l3_thread_decorator
+	(
+	  bli_gemm_int,
+	  BLIS_GEMM, // operation family id
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  rntm,
+	  cntl
+	);
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_gemm_md_zgemm
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     )
+{
+	// Performs gemm on temporary copies of A, B, and C that were cast to
+	// one common datatype, and then casts the result back to the caller's C.
+	bli_init_once();
+
+	obj_t   a_local;
+	obj_t   b_local;
+	obj_t   c_local;
+
+#if 1
+	obj_t   am, bm, cm;
+	obj_t*  c_orig;
+
+	// Redirect a, b, and c to temporaries that all share the datatype dt_c.
+	{
+		// dt_c is complex in the precision of C (real if A, B, C are real).
+		num_t dt_c = ( bli_obj_is_single_prec( c ) ? BLIS_SCOMPLEX
+		                                           : BLIS_DCOMPLEX );
+
+		if ( bli_obj_is_real( a ) &&
+		     bli_obj_is_real( b ) &&
+		     bli_obj_is_real( c ) ) dt_c = bli_dt_proj_to_real( dt_c );
+
+		dim_t m = bli_obj_length( c );
+		dim_t n = bli_obj_width( c );
+		dim_t k = bli_obj_width_after_trans( a );
+
+		bli_obj_create( dt_c, m, k, 0, 0, &am );
+		bli_obj_create( dt_c, k, n, 0, 0, &bm );
+		bli_obj_create( dt_c, m, n, 0, 0, &cm );
+
+		bli_castm( a, &am );
+		bli_castm( b, &bm );
+		bli_castm( c, &cm );
+
+		c_orig = c;
+
+		a = &am;
+		b = &bm;
+		c = &cm;
+	}
+#endif
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// If alpha is zero, scale by beta and return.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
+	{
+		bli_scalm( beta, c );
+#if 1
+		// c is the temporary cm here: write it back and free the temporaries.
+		bli_castm( &cm, c_orig );
+		bli_obj_free( &am );
+		bli_obj_free( &bm );
+		bli_obj_free( &cm );
+#endif
+		return;
+	}
+
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( a, &a_local );
+	bli_obj_alias_to( b, &b_local );
+	bli_obj_alias_to( c, &c_local );
+
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+	}
+
+	{
+		// A sort of hack for communicating the desired pack schemas for A and B
+		// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
+		// bli_l3_cntl_create_if()). This allows us to access the schemas from
+		// the control tree, which hopefully reduces some confusion, particularly
+		// in bli_packm_init().
+		if ( bli_cntx_method( cntx ) == BLIS_NAT )
+		{
+			bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
+			bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
+		}
+		else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
+		{
+			pack_t schema_a = bli_cntx_schema_a_block( cntx );
+			pack_t schema_b = bli_cntx_schema_b_panel( cntx );
+
+			bli_obj_set_pack_schema( schema_a, &a_local );
+			bli_obj_set_pack_schema( schema_b, &b_local );
+		}
+	}
+
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop, and then make any
+	// additional modifications necessary for the current operation.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm/hemm/symm
+	  bli_obj_length( &c_local ),
+	  bli_obj_width( &c_local ),
+	  bli_obj_width( &a_local ),
+	  rntm
+	);
+
+	// Invoke the internal back-end via the thread handler.
+	bli_l3_thread_decorator
+	(
+	  bli_gemm_int,
+	  BLIS_GEMM, // operation family id
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  rntm,
+	  cntl
+	);
+
+#if 1
+	// Cast the result back to the caller's C and release the temporaries.
+	{
+		bli_castm( &cm, c_orig );
+
+		bli_obj_free( &am );
+		bli_obj_free( &bm );
+		bli_obj_free( &cm );
+	}
+#endif
+}
+
+#endif
diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h
new file mode 100644
index 000000000..ec88ebff1
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_md.h
@@ -0,0 +1,327 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_gemm_md_c2r_ref.h"
+
+// Holds the two domains (computation, execution) returned by the md case functions.
+typedef struct mddm_s
+{
+	dom_t comp;
+	dom_t exec;
+} mddm_t;
+
+void bli_gemm_md
+     (
+       obj_t*   a,
+       obj_t*   b,
+       obj_t*   beta,
+       obj_t*   c,
+       cntx_t*  cntx_local,
+       cntx_t** cntx
+     );
+mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
+
+// -----------------------------------------------------------------------------
+
+void bli_gemm_md_front
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     );
+
+void bli_gemm_md_zgemm
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     );
+
+// -----------------------------------------------------------------------------
+
+static bool_t bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
+{
+	// Returns TRUE if the operands match the crr case (C is complex, A and
+	// B are real, and the execution domain of C is real), else FALSE.
+	//
+	// NOTE: The execution domain test becomes necessary if/when we allow
+	// the user to specify the computation domain. (The computation domain
+	// is currently ignored, but once it is honored as a user-settable
+	// value, it will affect the execution domain, which is what the last
+	// test checks. Until then, that test is not actually necessary since
+	// crr is already unconditionally associated with an execution domain
+	// of BLIS_REAL.)
+	if ( !bli_obj_is_complex( c ) ) return FALSE;
+	if ( !bli_obj_is_real( a )    ) return FALSE;
+	if ( !bli_obj_is_real( b )    ) return FALSE;
+	if ( bli_obj_exec_domain( c ) != BLIS_REAL ) return FALSE;
+
+	return TRUE;
+}
+
+static bool_t bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
+{
+	// Returns TRUE if the operands match the ccr case (C and A are complex,
+	// B is real, and the execution domain of C is complex), else FALSE.
+	//
+	// NOTE: The execution domain test becomes necessary if/when we allow
+	// the user to specify the computation domain. (The computation domain
+	// is currently ignored, but once it is honored as a user-settable
+	// value, it will affect the execution domain, which is what the last
+	// test checks. Until then, that test is not actually necessary since
+	// ccr is already unconditionally associated with an execution domain
+	// of BLIS_COMPLEX.)
+	if ( !bli_obj_is_complex( c ) ) return FALSE;
+	if ( !bli_obj_is_complex( a ) ) return FALSE;
+	if ( !bli_obj_is_real( b )    ) return FALSE;
+	if ( bli_obj_exec_domain( c ) != BLIS_COMPLEX ) return FALSE;
+
+	return TRUE;
+}
+
+static bool_t bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
+{
+	// Returns TRUE if the operands match the crc case (C and B are complex,
+	// A is real, and the execution domain of C is complex), else FALSE.
+	//
+	// NOTE: The execution domain test becomes necessary if/when we allow
+	// the user to specify the computation domain. (The computation domain
+	// is currently ignored, but once it is honored as a user-settable
+	// value, it will affect the execution domain, which is what the last
+	// test checks. Until then, that test is not actually necessary since
+	// crc is already unconditionally associated with an execution domain
+	// of BLIS_COMPLEX.)
+	if ( !bli_obj_is_complex( c ) ) return FALSE;
+	if ( !bli_obj_is_real( a )    ) return FALSE;
+	if ( !bli_obj_is_complex( b ) ) return FALSE;
+	if ( bli_obj_exec_domain( c ) != BLIS_COMPLEX ) return FALSE;
+
+	return TRUE;
+}
+
+// -----------------------------------------------------------------------------
+
+static void bli_gemm_md_ker_var2_recast
+     (
+       num_t* dt_comp,
+       num_t  dt_a,
+       num_t  dt_b,
+       num_t  dt_c,
+       dim_t* m,
+       dim_t* n,
+       dim_t* k,
+       inc_t* pd_a, inc_t* ps_a,
+       inc_t* pd_b, inc_t* ps_b,
+       obj_t* c,
+       inc_t* rs_c, inc_t* cs_c
+     )
+{
+	if      ( bli_is_real( dt_c )    &&
+	          bli_is_complex( dt_a ) &&
+	          bli_is_complex( dt_b ) )
+	{
+		// The rcc case is executed with a real macrokernel, so we need to
+		// double the k dimension (because both A and B are packed to the 1r
+		// schema), and also the panel strides of A and B since they were
+		// packed as complex matrices and we now need to convert them to
+		// units of real elements.
+		*k *= 2;
+		*ps_a *= 2;
+		*ps_b *= 2;
+	}
+	else if ( bli_is_complex( dt_c ) &&
+	          bli_is_real( dt_a )    &&
+	          bli_is_complex( dt_b ) )
+	{
+#if 1
+		obj_t beta;
+
+		bli_obj_scalar_detach( c, &beta );
+
+		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
+		     bli_obj_imag_is_zero( &beta ) &&
+		     bli_is_row_stored( *rs_c, *cs_c ) &&
+		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
+		{
+			// If beta is real, and C is row-stored, and the computation
+			// precision is equal to the storage precision of C, we can use the
+			// real macrokernel (and real microkernel, which is already stored
+			// to the real virtual microkernel slots of the context) instead of
+			// the complex macrokernel and c2r virtual microkernel.
+			*dt_comp = bli_dt_proj_to_real( *dt_comp );
+			*n *= 2;
+			*pd_b *= 2; *ps_b *= 2;
+			*rs_c *= 2;
+		}
+		else
+#endif
+		{
+			// Generally speaking, the crc case is executed with a complex
+			// macrokernel, so we need to halve the panel stride of A (which
+			// is real) since the macrokernel will perform the pointer
+			// arithmetic in units of complex elements.
+			*ps_a /= 2;
+		}
+	}
+	else if ( bli_is_complex( dt_c ) &&
+	          bli_is_complex( dt_a ) &&
+	          bli_is_real( dt_b ) )
+	{
+#if 1
+		obj_t beta;
+
+		bli_obj_scalar_detach( c, &beta );
+
+		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
+		     bli_obj_imag_is_zero( &beta ) &&
+		     bli_is_col_stored( *rs_c, *cs_c ) &&
+		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
+		{
+			// If beta is real, and C is column-stored, and the computation
+			// precision is equal to the storage precision of C, we can use the
+			// real macrokernel (and real microkernel, which is already stored
+			// to the real virtual microkernel slots of the context) instead of
+			// the complex macrokernel and c2r virtual microkernel.
+			*dt_comp = bli_dt_proj_to_real( *dt_comp );
+			*m *= 2;
+			*pd_a *= 2; *ps_a *= 2;
+			*cs_c *= 2;
+		}
+		else
+#endif
+		{
+			// Generally speaking, the ccr case is executed with a complex
+			// macrokernel, so we need to halve the panel stride of B (which
+			// is real) since the macrokernel will perform the pointer
+			// arithmetic in units of complex elements.
+			*ps_b /= 2;
+		}
+	}
+#if 0
+	else if ( bli_is_real( dt_c ) &&
+	          bli_is_real( dt_a ) &&
+	          bli_is_real( dt_b ) )
+	{
+		// No action needed.
+//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
+	}
+	else if ( bli_is_complex( dt_c ) &&
+	          bli_is_real( dt_a ) &&
+	          bli_is_real( dt_b ) )
+	{
+		// No action needed.
+	}
+	else if ( bli_is_real( dt_c ) &&
+	          bli_is_complex( dt_a ) &&
+	          bli_is_real( dt_b ) )
+	{
+		// No action needed.
+	}
+	else if ( bli_is_real( dt_c ) &&
+	          bli_is_real( dt_a ) &&
+	          bli_is_complex( dt_b ) )
+	{
+		// No action needed.
+	}
+#endif
+}
+
+// -----------------------------------------------------------------------------
+
+//
+// Prototype the object-based interface of gemm_ker_var2_md.
+//
+
+#undef  GENPROT
+#define GENPROT( opname ) \
+\
+void PASTEMAC0(opname) \
+     ( \
+       obj_t*  a, \
+       obj_t*  b, \
+       obj_t*  c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       cntl_t* cntl, \
+       thrinfo_t* thread  \
+     );
+
+GENPROT( gemm_ker_var2_md )
+
+//
+// Prototype the typed gemm_ker_var2_md interfaces (void pointer operands).
+//
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_c, ctype_e, chc, che, varname ) \
+\
+void PASTEMAC2(chc,che,varname) \
+     ( \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     );
+
+INSERT_GENTPROT2_BASIC0( gemm_ker_var2_md )
+INSERT_GENTPROT2_MIXDP0( gemm_ker_var2_md )
+
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c
new file mode 100644
index 000000000..f1479b5b1
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c
@@ -0,0 +1,223 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_GEMM_MD
+
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \
+\
+void PASTEMAC2(ch,opname,suf) \
+     ( \
+       dim_t               k, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, \
+       ctype*     restrict b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t*    restrict cntx  \
+     ) \
+{ \
+	const num_t       dt        = PASTEMAC(ch,type); \
+	const num_t       dt_r      = PASTEMAC(chr,type); \
+\
+	PASTECH(chr,gemm_ukr_ft) \
+	                  rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
+	const bool_t      col_pref  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const bool_t      row_pref  = !col_pref; \
+\
+	const dim_t       mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+	const dim_t       nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+\
+	ctype             ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                      / sizeof( ctype_r ) ] \
+	                      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	inc_t             rs_ct; \
+	inc_t             cs_ct; \
+\
+	ctype_r* restrict a_r       = ( ctype_r* )a; \
+\
+	ctype_r* restrict b_r       = ( ctype_r* )b; \
+\
+	ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
+\
+	ctype_r* restrict alpha_r   = &PASTEMAC(ch,real)( *alpha ); \
+/*
+	ctype_r* restrict alpha_i   = &PASTEMAC(ch,imag)( *alpha ); \
+*/ \
+\
+	ctype_r* restrict beta_r    = &PASTEMAC(ch,real)( *beta ); \
+	ctype_r* restrict beta_i    = &PASTEMAC(ch,imag)( *beta ); \
+\
+	ctype_r*          c_use; \
+	inc_t             rs_c_use; \
+	inc_t             cs_c_use; \
+\
+	bool_t            using_ct; \
+\
+/*
+PASTEMAC(d,fprintm)( stdout, "gemm_ukr: a", 2*mr, k, \
+                     a_r, 1, 6, "%5.2f", "" ); \
+PASTEMAC(d,fprintm)( stdout, "gemm_ukr: b", k, nr, \
+                     b_r, 8, 1, "%5.2f", "" ); \
+*/ \
+\
+	/* SAFETY CHECK: The higher level implementation should never
+	   allow an alpha with non-zero imaginary component to be passed
+	   in, because it can't be applied properly using the 1m method.
+	   If alpha is not real, then something is very wrong. */ \
+/*
+	if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
+		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+*/ \
+\
+	/* If beta has a non-zero imaginary component OR if c is stored with
+	   general stride, then we compute the alpha*a*b product into temporary
+	   storage and then accumulate that result into c afterwards. Note that
+	   the other two cases concerning disagreement between the storage of C
+	   and the output preference of the micro-kernel, should ONLY occur in
+	   the context of trsm, whereby this virtual micro-kernel is called
+	   directly from the trsm macro-kernel to update the micro-tile b11
+	   that exists within the packed row-panel of B. Indeed that is the
+	   reason those cases MUST be explicitly handled. */ \
+	if      ( !PASTEMAC(chr,eq0)( *beta_i ) )               using_ct = TRUE; \
+	else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
+	else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
+	else if ( bli_is_gen_stored( rs_c, cs_c ) )             using_ct = TRUE; \
+	else                                                    using_ct = FALSE; \
+\
+\
+	if ( using_ct ) \
+	{ \
+		/* In the atypical cases, we compute the result into temporary
+		   workspace ct and then accumulate it back to c at the end. */ \
+\
+		/* Set the strides of ct based on the preference of the underlying
+		   native real domain gemm micro-kernel. Note that we set the ct
+		   strides in units of complex elements. */ \
+		if ( col_pref ) { rs_ct = 1;  cs_ct = mr; } \
+		else            { rs_ct = nr; cs_ct = 1; } \
+\
+		c_use    = ( ctype_r* )ct; \
+		rs_c_use = rs_ct; \
+		cs_c_use = cs_ct; \
+\
+		/* Convert the strides from being in units of complex elements to
+		   be in units of real elements. Note that we don't need to check for
+		   general storage here because the ct strides were set just above
+		   to either (1, mr) or (nr, 1). */ \
+		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
+		else                                           rs_c_use *= 2; \
+\
+		/* ct = alpha_r * a * b; */ \
+		rgemm_ukr \
+		( \
+		  k, \
+		  alpha_r, \
+		  a_r, \
+		  b_r, \
+		  zero_r, \
+		  c_use, rs_c_use, cs_c_use, \
+		  data, \
+		  cntx  \
+		); \
+\
+		dim_t i, j; \
+\
+		/* Accumulate the final result in ct back to c. */ \
+		if ( PASTEMAC(ch,eq1)( *beta ) ) \
+		{ \
+			for ( j = 0; j < nr; ++j ) \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
+				                   *(c  + i*rs_c  + j*cs_c ) ); \
+			} \
+		} \
+		else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+		{ \
+			for ( j = 0; j < nr; ++j ) \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
+				                    *(c  + i*rs_c  + j*cs_c ) ); \
+			} \
+		} \
+		else /*if ( !PASTEMAC(ch,eq1)( *beta ) )*/ \
+		{ \
+			for ( j = 0; j < nr; ++j ) \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
+				                    *beta, \
+				                    *(c  + i*rs_c  + j*cs_c ) ); \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		/* In the typical cases, we use the real part of beta and
+		   accumulate directly into the output matrix c. */ \
+\
+		c_use    = ( ctype_r* )c; \
+		rs_c_use = rs_c; \
+		cs_c_use = cs_c; \
+\
+		/* Convert the strides from being in units of complex elements to
+		   be in units of real elements. Note that we don't need to check for
+		   general storage here because that case corresponds to the scenario
+		   where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
+		if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
+		else                                           rs_c_use *= 2; \
+\
+		/* c = beta_r * c + alpha_r * a * b; */ \
+		rgemm_ukr \
+		( \
+		  k, \
+		  alpha_r, \
+		  a_r, \
+		  b_r, \
+		  beta_r, \
+		  c_use, rs_c_use, cs_c_use, \
+		  data, \
+		  cntx  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC( gemm_md_c2r, BLIS_REF_SUFFIX )
+
+#endif
diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.h b/frame/3/gemm/bli_gemm_md_c2r_ref.h
new file mode 100644
index 000000000..fa5893e2f
--- /dev/null
+++ b/frame/3/gemm/bli_gemm_md_c2r_ref.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// -- Level-3 native micro-kernel prototype redefinitions ----------------------
+
+#undef  gemm_ukr_name
+#define gemm_ukr_name   gemm_md_c2r_ref
+
+// Include the native micro-kernel API template (presumably keyed on gemm_ukr_name).
+#include "bli_l3_ukr.h"
diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c
index 93c014051..f45542d37 100644
--- a/frame/3/herk/bli_herk_l_ker_var2.c
+++ b/frame/3/herk/bli_herk_l_ker_var2.c
@@ -278,6 +278,9 @@ void PASTEMAC(ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c
index 5875c3317..3061a5c39 100644
--- a/frame/3/herk/bli_herk_u_ker_var2.c
+++ b/frame/3/herk/bli_herk_u_ker_var2.c
@@ -278,6 +278,9 @@ void PASTEMAC(ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c
index ff64501aa..eef104eed 100644
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -316,6 +316,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c
index bfe57ba16..23dd22cb8 100644
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -323,6 +323,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c
index e2eef964e..ae44e8ff9 100644
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -323,6 +323,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of A to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c
index c76bc535f..9d7ec4cfe 100644
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -324,6 +324,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of A to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( istep_a, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index 34fc6a2b6..021f8baf2 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -338,6 +338,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 78e2a7a15..0ddcd16d4 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -346,6 +346,9 @@ void PASTEMAC(ch,varname) \
 \
 	/* Save the imaginary stride of B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_b( istep_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 8045fe09d..1cf456678 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -368,6 +368,9 @@ void PASTEMAC(ch,varname) \
 	   NOTE: We swap the values for A and B since the triangular
 	   "A" matrix is actually contained within B. */ \
 	bli_auxinfo_set_is_b( istep_a, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index e1279813c..b5a76d03a 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -363,6 +363,9 @@ void PASTEMAC(ch,varname) \
 	   NOTE: We swap the values for A and B since the triangular
 	   "A" matrix is actually contained within B. */ \
 	bli_auxinfo_set_is_b( istep_a, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h
index 09595968e..cf31ffa0f 100644
--- a/frame/base/bli_auxinfo.h
+++ b/frame/base/bli_auxinfo.h
@@ -65,10 +65,12 @@ static inc_t bli_auxinfo_is_b( auxinfo_t* ai )
 	return ai->is_b;
 }
 
+#if 0
 static inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai )
 {
 	return ai->dt_on_output;
 }
+#endif
 
 
 // auxinfo_t field modification
@@ -105,10 +107,12 @@ static void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
 	ai->is_b = is;
 }
 
+#if 0
 static void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai )
 {
 	ai->dt_on_output = dt_on_output;
 }
+#endif
 
 #endif 
 
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 3cfc6c39c..f8ed2663f 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -393,7 +393,9 @@ static bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cnt
 
 static bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
 {
-	const num_t  dt    = bli_obj_dt( obj );
+	// Note that we use the computation datatype, which may differ from the
+	// storage datatype of C (when performing a mixed datatype operation).
+	const num_t  dt    = bli_obj_comp_dt( obj );
 	const bool_t ukr_prefers_rows
 	                   = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
 	const bool_t ukr_prefers_cols
@@ -442,9 +444,9 @@ static bool_t bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cnt
 
 static bool_t bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
 {
-	// Note that we use the execution datatype, which may differ from the
-	// storage datatype of C (though this would happen in very few situations).
-	const num_t  dt    = bli_obj_exec_dt( obj );
+	// Note that we use the computation datatype, which may differ from the
+	// storage datatype of C (when performing a mixed datatype operation).
+	const num_t  dt    = bli_obj_comp_dt( obj );
 	const bool_t ukr_prefers_rows
 	                   = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx );
 	const bool_t ukr_prefers_cols
diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c
index 407f31cb5..722b5b93a 100644
--- a/frame/base/bli_obj.c
+++ b/frame/base/bli_obj.c
@@ -108,6 +108,7 @@ void bli_obj_create_without_buffer
 	bli_obj_set_elem_size( elem_size, obj );
 	bli_obj_set_target_dt( dt, obj );
 	bli_obj_set_exec_dt( dt, obj );
+	bli_obj_set_comp_dt( dt, obj );
 	bli_obj_set_dims( m, n, obj );
 	bli_obj_set_offs( 0, 0, obj );
 	bli_obj_set_diag_offset( 0, obj );
@@ -115,8 +116,14 @@ void bli_obj_create_without_buffer
 	// Set the internal scalar to 1.0.
 	s = bli_obj_internal_scalar_buffer( obj );
 
-	if      ( bli_is_float( dt )    ) { bli_sset1s( *(( float*    )s) ); }
-	else if ( bli_is_double( dt )   ) { bli_dset1s( *(( double*   )s) ); }
+	// Always writing the imaginary component is needed in mixed-domain
+	// scenarios. Failing to do this can lead to reading uninitialized
+	// memory just before calling the macrokernel (as the internal scalars
+	// for A and B are merged).
+	//if      ( bli_is_float( dt )    ) { bli_sset1s( *(( float*    )s) ); }
+	//else if ( bli_is_double( dt )   ) { bli_dset1s( *(( double*   )s) ); }
+	if      ( bli_is_float( dt )    ) { bli_cset1s( *(( scomplex* )s) ); }
+	else if ( bli_is_double( dt )   ) { bli_zset1s( *(( dcomplex* )s) ); }
 	else if ( bli_is_scomplex( dt ) ) { bli_cset1s( *(( scomplex* )s) ); }
 	else if ( bli_is_dcomplex( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); }
 }
diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c
index 73446830f..b4091a38f 100644
--- a/frame/base/bli_query.c
+++ b/frame/base/bli_query.c
@@ -86,6 +86,7 @@ bool_t bli_obj_equals( obj_t* a,
 bool_t bli_obj_imag_equals( obj_t* a,
                             obj_t* b )
 {
+#if 0
 	bool_t r_val = FALSE;
 	num_t  dt_a;
 	num_t  dt_b;
@@ -128,7 +129,51 @@ bool_t bli_obj_imag_equals( obj_t* a,
 			r_val = bli_deq( bli_zimag( *ap_z ), *bp_z );
 		}
 	}
+#endif
+	bool_t r_val = FALSE;
+
+	// The function is not yet implemented for vectors and matrices.
+	if ( !bli_obj_is_1x1( a ) ||
+	     !bli_obj_is_1x1( b ) ||
+	     bli_obj_is_complex( b ) )
+		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+	double a_r, a_i;
+	double b_r, b_i;
+
+	// Get the real and imaginary parts of a and cast them to local doubles.
+	bli_getsc( a, &a_r, &a_i );
+
+	// Get the value of b and cast to a local double. (Note: the imaginary part
+	// of b is ignored since we know b is real.)
+	bli_getsc( b, &b_r, &b_i );
+
+	// Compare the imaginary part of a to the real part of b.
+	if ( a_i == b_r ) r_val = TRUE;
 
 	return r_val;
 }
 
+bool_t bli_obj_imag_is_zero( obj_t* a )
+{
+	bool_t r_val = TRUE;
+
+	// The function is not yet implemented for vectors and matrices.
+	if ( !bli_obj_is_1x1( a ) )
+		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+	if ( bli_obj_is_complex( a ) )
+	{
+		double a_r, a_i;
+
+		// Get the real and imaginary parts and cast them to local doubles.
+		bli_getsc( a, &a_r, &a_i );
+
+		// Compare the imaginary part of a to double-precision zero.
+		if ( !bli_deq0( a_i ) ) r_val = FALSE;
+	}
+
+	return r_val;
+}
+
+
diff --git a/frame/base/bli_query.h b/frame/base/bli_query.h
index 827752103..afc01cd5a 100644
--- a/frame/base/bli_query.h
+++ b/frame/base/bli_query.h
@@ -37,3 +37,5 @@ bool_t bli_obj_equals( obj_t* a,
 
 bool_t bli_obj_imag_equals( obj_t* a,
                             obj_t* b );
+
+bool_t bli_obj_imag_is_zero( obj_t* a );
diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c
new file mode 100644
index 000000000..548e6410f
--- /dev/null
+++ b/frame/base/cast/bli_castnzm.c
@@ -0,0 +1,267 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// NOTE: This is one of the few functions in BLIS that is defined
+// with heterogeneous type support. This is done so that we have
+// an operation that can be used to typecast (copy-cast) a matrix
+// of one datatype to a matrix of another datatype.
+
+typedef void (*FUNCPTR_T)
+     (
+       trans_t        transa,
+       dim_t          m,
+       dim_t          n,
+       void* restrict a, inc_t rs_a, inc_t cs_a,
+       void* restrict b, inc_t rs_b, inc_t cs_b
+     );
+
+static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm);
+
+// Object-based interface: bli_castnzm( a, b ) dispatches on the storage
+// datatypes of a and b to the matching typed castnzm function defined below.
+// The conjugation/transposition status is read from a; m and n come from b.
+
+void bli_castnzm
+     (
+       obj_t* a,
+       obj_t* b
+     )
+{
+	num_t     dt_a     = bli_obj_dt( a );
+	num_t     dt_b     = bli_obj_dt( b );
+
+	trans_t   transa   = bli_obj_conjtrans_status( a );
+
+	dim_t     m        = bli_obj_length( b );
+	dim_t     n        = bli_obj_width( b );
+
+	void*     buf_a    = bli_obj_buffer_at_off( a );
+	inc_t     rs_a     = bli_obj_row_stride( a );
+	inc_t     cs_a     = bli_obj_col_stride( a );
+
+	void*     buf_b    = bli_obj_buffer_at_off( b );
+	inc_t     rs_b     = bli_obj_row_stride( b );
+	inc_t     cs_b     = bli_obj_col_stride( b );
+
+	FUNCPTR_T f;
+
+	// Check parameters (only when error checking is enabled).
+	if ( bli_error_checking_is_enabled() )
+		bli_castnzm_check( a, b );
+
+#if 0
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) )
+	{
+		// If a and b share the same datatype, we can simply use copym.
+		bli_copym( a, b );
+		return;
+	}
+#endif
+
+	// Index into the type combination array (first by a's datatype, then
+	// by b's) to extract the correct function pointer.
+	f = ftypes[dt_a][dt_b];
+
+	// Invoke the selected void pointer-based function on the raw buffers.
+	f
+	(
+	  transa,
+	  m,
+	  n,
+	  buf_a, rs_a, cs_a,
+	  buf_b, rs_b, cs_b
+	);
+}
+
+// -----------------------------------------------------------------------------
+
+//
+// Define BLAS-like interfaces with typed operands.
+//
+
+#undef  GENTFUNC2
+#define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \
+\
+void PASTEMAC2(cha,chb,opname) \
+     ( \
+       trans_t        transa, \
+       dim_t          m, \
+       dim_t          n, \
+       void* restrict a, inc_t rs_a, inc_t cs_a, \
+       void* restrict b, inc_t rs_b, inc_t cs_b  \
+     ) \
+{ \
+	ctype_a* restrict a_cast = a; \
+	ctype_b* restrict b_cast = b; \
+	conj_t            conja; \
+	dim_t             n_iter; \
+	dim_t             n_elem; \
+	inc_t             lda, inca; \
+	inc_t             ldb, incb; \
+	dim_t             j, i; \
+\
+	/* Set various loop parameters. */ \
+	bli_set_dims_incs_2m \
+	( \
+	  transa, \
+	  m,       n,       rs_a,  cs_a, rs_b,  cs_b, \
+	  &n_elem, &n_iter, &inca, &lda, &incb, &ldb  \
+	); \
+\
+	/* Extract the conjugation component from the transa parameter. */ \
+	conja = bli_extract_conj( transa ); \
+\
+	if ( bli_is_conj( conja ) ) \
+	{ \
+		if ( inca == 1 && incb == 1 ) \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(cha,chb,copyjnzs)( a1[i], b1[i] ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(cha,chb,copyjnzs)( *a1, *b1 ); \
+\
+					a1 += inca; \
+					b1 += incb; \
+				} \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		if ( inca == 1 && incb == 1 ) \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(cha,chb,copynzs)( a1[i], b1[i] ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( j = 0; j < n_iter; ++j ) \
+			{ \
+				ctype_a* restrict a1 = a_cast + (j  )*lda + (0  )*inca; \
+				ctype_b* restrict b1 = b_cast + (j  )*ldb + (0  )*incb; \
+\
+				for ( i = 0; i < n_elem; ++i ) \
+				{ \
+					PASTEMAC2(cha,chb,copynzs)( *a1, *b1 ); \
+\
+					a1 += inca; \
+					b1 += incb; \
+				} \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNC2_BASIC0( castnzm )
+INSERT_GENTFUNC2_MIXDP0( castnzm )
+
+// -----------------------------------------------------------------------------
+
+// Object-based _check() function for bli_castnzm(). Each check's error
+// code is stored in e_val and handed to bli_check_error_code(); the checks
+// cover the datatypes, structure, dimensions, and buffers of both a and b.
+
+void bli_castnzm_check
+     (
+       obj_t* a,
+       obj_t* b
+     )
+{
+	err_t e_val;
+
+	// Check object datatypes.
+
+	e_val = bli_check_floating_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( b );
+	bli_check_error_code( e_val );
+
+	// Check structure.
+	// NOTE: We enforce general structure for now in order to simplify the
+	// implementation; each result is captured in e_val so it is reported.
+
+	e_val = bli_check_general_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
+
+	// Check object dimensions.
+
+	e_val = bli_check_matrix_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_matrix_object( b );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_conformal_dims( a, b );
+	bli_check_error_code( e_val );
+
+	// Check object buffers (for non-NULLness).
+
+	e_val = bli_check_object_buffer( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_object_buffer( b );
+	bli_check_error_code( e_val );
+}
+
diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h
new file mode 100644
index 000000000..7770515b8
--- /dev/null
+++ b/frame/base/cast/bli_castnzm.h
@@ -0,0 +1,73 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//
+// Prototype object-based interface.
+//
+
+void bli_castnzm
+     (
+       obj_t* a,
+       obj_t* b
+     );
+
+//
+// Prototype BLAS-like interfaces with heterogeneous-typed operands.
+//
+
+#undef  GENTPROT2
+#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
+\
+void PASTEMAC2(cha,chb,opname) \
+     ( \
+       trans_t transa, \
+       dim_t   m, \
+       dim_t   n, \
+       void*   a, inc_t rs_a, inc_t cs_a, \
+       void*   b, inc_t rs_b, inc_t cs_b  \
+     );
+
+INSERT_GENTPROT2_BASIC0( castnzm )
+INSERT_GENTPROT2_MIXDP0( castnzm )
+
+//
+// Prototype object-based _check() function.
+//
+
+void bli_castnzm_check
+     (
+       obj_t* a,
+       obj_t* b
+     );
+
diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index 4c867b719..9a0a905e4 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -67,7 +67,7 @@ void PASTEF772(chx,cha,blasname) \
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
-	PASTEMAC2(cha,chx,cast)( (ftype_a*)alpha, alpha_cast ); \
+	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index 8c1f2efc9..efda67931 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -99,6 +99,25 @@
 #endif
 
 
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Enable mixed datatype support?
+#ifdef BLIS_DISABLE_MIXED_DT
+  #undef BLIS_ENABLE_GEMM_MD
+#else
+  // Default behavior is enabled.
+  #define BLIS_ENABLE_GEMM_MD
+#endif
+
+// Enable memory-intensive optimizations for mixed datatype support?
+#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
+  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+#else
+  // Default behavior is enabled.
+  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+#endif
+
+
 // -- MISCELLANEOUS OPTIONS ----------------------------------------------------
 
 // Do NOT require the cross-blocksize constraints. That is, do not enforce
diff --git a/frame/include/bli_genarray_macro_defs.h b/frame/include/bli_genarray_macro_defs.h
index e288dbac2..556fa7542 100644
--- a/frame/include/bli_genarray_macro_defs.h
+++ b/frame/include/bli_genarray_macro_defs.h
@@ -65,6 +65,18 @@ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
 
 // -- "Smart" two-operand macro --
 
+#define GENARRAY_FPA2(tname,op) \
+\
+static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
+{ \
+	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
+	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
+	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
+	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
+}
+
+// -- "Smart" two-operand macro --
+
 /*
 #define GENARRAY2_VFP(arrayname,op) \
 \
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index 22b8d6302..57e5e2b3e 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -529,6 +529,52 @@ GENTFUNC2R( dcomplex, scomplex, double,   z, c, d, tfuncname, varname )
 
 
 
+// -- Mixed domain/precision (all) two-operand macro with real projection of first operand (NOTE(review): expands to GENTFUNC2, which is passed no projection type; confirm) --
+
+// -- (no auxiliary arguments) --
+
+#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
+\
+GENTFUNC2( float,    double,   s, d, tfuncname ) \
+GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
+GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
+\
+GENTFUNC2( double,   float,    d, s, tfuncname ) \
+GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
+GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
+\
+GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
+GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
+GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
+\
+GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
+GENTFUNC2( dcomplex, double,   z, d, tfuncname ) \
+GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )
+
+
+// -- (one auxiliary argument) --
+
+#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
+\
+GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
+GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
+GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
+\
+GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
+GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
+GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
+\
+GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
+GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
+GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
+\
+GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
+GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname ) \
+GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )
+
+
+
+
 // -- Macros for functions with three primary operands -------------------------
 
 
diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h
index 71a3fa2b1..f0f9761f5 100644
--- a/frame/include/bli_misc_macro_defs.h
+++ b/frame/include/bli_misc_macro_defs.h
@@ -136,6 +136,20 @@ static void bli_toggle_bool( bool_t* b )
 #define bli_ctype ( BLIS_SCOMPLEX )
 #define bli_ztype ( BLIS_DCOMPLEX )
 
+// return C type for char
+
+#define bli_sctype  float
+#define bli_dctype  double
+#define bli_cctype  scomplex
+#define bli_zctype  dcomplex
+
+// return real proj of C type for char
+
+#define bli_sctyper  float
+#define bli_dctyper  double
+#define bli_cctyper  float
+#define bli_zctyper  double
+
 
 // return default format specifier for char
 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h
index 77fd524dd..acc32ccbb 100644
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -122,13 +122,15 @@ static num_t bli_obj_dt_proj_to_double_prec( obj_t* obj )
 static bool_t bli_obj_is_real( obj_t* obj )
 {
 	return ( bool_t )
-	       ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL );
+	       ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL &&
+	         !bli_obj_is_const( obj ) );
 }
 
 static bool_t bli_obj_is_complex( obj_t* obj )
 {
 	return ( bool_t )
-	       ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX );
+	       ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX &&
+	         !bli_obj_is_const( obj ) );
 }
 
 static num_t bli_obj_dt_proj_to_real( obj_t* obj )
@@ -179,6 +181,24 @@ static prec_t bli_obj_exec_prec( obj_t* obj )
 	       ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT );
 }
 
+static num_t bli_obj_comp_dt( obj_t* obj )
+{
+	return ( num_t )
+	       ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT );
+}
+
+static dom_t bli_obj_comp_domain( obj_t* obj )
+{
+	return ( dom_t )
+	       ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT );
+}
+
+static prec_t bli_obj_comp_prec( obj_t* obj )
+{
+	return ( prec_t )
+	       ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT );
+}
+
 static trans_t bli_obj_conjtrans_status( obj_t* obj )
 {
 	return ( trans_t )
@@ -454,6 +474,24 @@ static void bli_obj_set_exec_prec( prec_t dt, obj_t* obj )
 	            ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT );
 }
 
+static void bli_obj_set_comp_dt( num_t dt, obj_t* obj )
+{
+	obj->info = ( objbits_t )
+	            ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT );
+}
+
+static void bli_obj_set_comp_domain( dom_t dt, obj_t* obj )
+{
+	obj->info = ( objbits_t )
+	            ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT );
+}
+
+static void bli_obj_set_comp_prec( prec_t dt, obj_t* obj )
+{
+	obj->info = ( objbits_t )
+	            ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT );
+}
+
 static void bli_obj_set_pack_schema( pack_t schema, obj_t* obj )
 {
 	obj->info = ( objbits_t )
@@ -1183,9 +1221,11 @@ static void bli_obj_real_part( obj_t* c, obj_t* r )
 		const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c )        );
 		const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) );
 		const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c )   );
+		const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c )   );
 		bli_obj_set_dt(        dt_stor_r, r );
 		bli_obj_set_target_dt( dt_targ_r, r );
 		bli_obj_set_exec_dt(   dt_exec_r, r );
+		bli_obj_set_comp_dt(   dt_comp_r, r );
 
 		// Update the element size.
 		siz_t es_c = bli_obj_elem_size( c );
@@ -1212,9 +1252,11 @@ static void bli_obj_imag_part( obj_t* c, obj_t* i )
 		const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c )        );
 		const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) );
 		const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c )   );
+		const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c )   );
 		bli_obj_set_dt(        dt_stor_r, i );
 		bli_obj_set_target_dt( dt_targ_r, i );
 		bli_obj_set_exec_dt(   dt_exec_r, i );
+		bli_obj_set_comp_dt(   dt_comp_r, i );
 
 		// Update the element size.
 		siz_t es_c = bli_obj_elem_size( c );
@@ -1251,13 +1293,24 @@ static void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, v
 	}
 }
 
-// Swap object contents.
+// Swap all object fields (metadata/properties).
 
 static void bli_obj_swap( obj_t* a, obj_t* b )
 {
 	obj_t t = *b; *b = *a; *a = t;
 }
 
+// Swap object pack schemas.
+
+static void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b )
+{
+	const pack_t schema_a = bli_obj_pack_schema( a );
+	const pack_t schema_b = bli_obj_pack_schema( b );
+
+	bli_obj_set_pack_schema( schema_b, a );
+	bli_obj_set_pack_schema( schema_a, b );
+}
+
 // Induce a transposition on an object: swap dimensions, increments, and
 // offsets, then clear the trans bit.
 
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h
index 0c96e7f84..6828bdfe9 100644
--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -140,14 +140,15 @@
 
 #include "bli_axmys.h"
 
-#include "bli_cast.h"
-
 #include "bli_conjs.h"
 
 #include "bli_copys.h"
 #include "bli_copyjs.h"
 #include "bli_copycjs.h"
 
+#include "bli_copynzs.h"
+#include "bli_copyjnzs.h"
+
 #include "bli_dots.h"
 #include "bli_dotjs.h"
 
@@ -191,8 +192,8 @@
 // Inlined scalar macros in loops
 #include "bli_adds_mxn.h"
 #include "bli_adds_mxn_uplo.h"
-#include "bli_copys_mxn.h"
 #include "bli_set0s_mxn.h"
+#include "bli_copys_mxn.h"
 #include "bli_xpbys_mxn.h"
 #include "bli_xpbys_mxn_uplo.h"
 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index ca1dfa41e..1e5b14a26 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -257,6 +257,10 @@ typedef dcomplex  f77_dcomplex;
            - 1 == Hermitian
            - 2 == symmetric
            - 3 == triangular
+  31 ~ 29  Computation numerical datatype
+           - 29: domain    (0 == real, 1 == complex)
+           - 30: precision (0 == single, 1 == double)
+           - 31: used to encode integer, constant types
 */
 
 #define BLIS_DATATYPE_SHIFT                0
@@ -286,6 +290,9 @@ typedef dcomplex  f77_dcomplex;
 #define BLIS_PACK_REV_IF_LOWER_SHIFT       24
 #define BLIS_PACK_BUFFER_SHIFT             25
 #define BLIS_STRUC_SHIFT                   27
+#define BLIS_COMP_DT_SHIFT                 29
+#define   BLIS_COMP_DOMAIN_SHIFT           29
+#define   BLIS_COMP_PREC_SHIFT             30
 
 //
 // -- BLIS info bit field masks ------------------------------------------------
@@ -318,6 +325,9 @@ typedef dcomplex  f77_dcomplex;
 #define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
 #define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
 #define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
+#define BLIS_COMP_DT_BITS                  ( 0x7u << BLIS_COMP_DT_SHIFT ) // unsigned: 0x7 << 29 is not representable in a 32-bit int
+#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT ) // domain bit of the execution datatype
+#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT ) // precision bit of the execution datatype
 
 
 //
@@ -603,13 +613,15 @@ typedef enum
 
 typedef enum
 {
-	BLIS_3MH = 0,
+	BLIS_3MH       = 0,
 	BLIS_3M1,
 	BLIS_4MH,
 	BLIS_4M1B,
 	BLIS_4M1A,
 	BLIS_1M,
-	BLIS_NAT
+	BLIS_NAT,
+	BLIS_IND_FIRST = 0,        // alias: same value as the first enumerator, BLIS_3MH
+	BLIS_IND_LAST  = BLIS_NAT  // alias: same value as the last method, BLIS_NAT
 } ind_t;
 
 #define BLIS_NUM_IND_METHODS (BLIS_NAT+1)
@@ -1003,7 +1015,7 @@ typedef struct
 	inc_t  is_b;
 
 	// The type to convert to on output.
-	num_t  dt_on_output;
+	//num_t  dt_on_output;
 
 } auxinfo_t;
 
diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h
index 0522410be..fcb1f4324 100644
--- a/frame/include/bli_x86_asm_macros.h
+++ b/frame/include/bli_x86_asm_macros.h
@@ -671,12 +671,68 @@
 #define MOVD(_0, _1) INSTR_(movd, _0, _1)
 #define MOVL(_0, _1) INSTR_(movl, _0, _1)
 #define MOVQ(_0, _1) INSTR_(movq, _0, _1)
+#define CMOVA(_0, _1) INSTR_(cmova, _0, _1) // Conditional moves: one two-operand wrapper per cmovCC mnemonic.
+#define CMOVAE(_0, _1) INSTR_(cmovae, _0, _1)
+#define CMOVB(_0, _1) INSTR_(cmovb, _0, _1)
+#define CMOVBE(_0, _1) INSTR_(cmovbe, _0, _1)
+#define CMOVC(_0, _1) INSTR_(cmovc, _0, _1)
+#define CMOVP(_0, _1) INSTR_(cmovp, _0, _1)
+#define CMOVO(_0, _1) INSTR_(cmovo, _0, _1)
+#define CMOVS(_0, _1) INSTR_(cmovs, _0, _1)
+#define CMOVE(_0, _1) INSTR_(cmove, _0, _1)
+#define CMOVZ(_0, _1) INSTR_(cmovz, _0, _1)
+#define CMOVG(_0, _1) INSTR_(cmovg, _0, _1)
+#define CMOVGE(_0, _1) INSTR_(cmovge, _0, _1)
+#define CMOVL(_0, _1) INSTR_(cmovl, _0, _1)
+#define CMOVLE(_0, _1) INSTR_(cmovle, _0, _1)
+#define CMOVNA(_0, _1) INSTR_(cmovna, _0, _1)
+#define CMOVNAE(_0, _1) INSTR_(cmovnae, _0, _1)
+#define CMOVNB(_0, _1) INSTR_(cmovnb, _0, _1)
+#define CMOVNBE(_0, _1) INSTR_(cmovnbe, _0, _1)
+#define CMOVNC(_0, _1) INSTR_(cmovnc, _0, _1)
+#define CMOVNP(_0, _1) INSTR_(cmovnp, _0, _1)
+#define CMOVNO(_0, _1) INSTR_(cmovno, _0, _1)
+#define CMOVNS(_0, _1) INSTR_(cmovns, _0, _1)
+#define CMOVNE(_0, _1) INSTR_(cmovne, _0, _1)
+#define CMOVNZ(_0, _1) INSTR_(cmovnz, _0, _1)
+#define CMOVNG(_0, _1) INSTR_(cmovng, _0, _1)
+#define CMOVNGE(_0, _1) INSTR_(cmovnge, _0, _1)
+#define CMOVNL(_0, _1) INSTR_(cmovnl, _0, _1)
+#define CMOVNLE(_0, _1) INSTR_(cmovnle, _0, _1)
 
 #define lea(_0, _1) LEA(_0, _1)
 #define mov(_0, _1) MOV(_0, _1)
 #define movd(_0, _1) MOVD(_0, _1)
 #define movl(_0, _1) MOVL(_0, _1)
 #define movq(_0, _1) MOVQ(_0, _1)
+#define cmova(_0, _1) CMOVA(_0, _1) // Lowercase spellings: each forwards unchanged to the uppercase macro.
+#define cmovae(_0, _1) CMOVAE(_0, _1)
+#define cmovb(_0, _1) CMOVB(_0, _1)
+#define cmovbe(_0, _1) CMOVBE(_0, _1)
+#define cmovc(_0, _1) CMOVC(_0, _1)
+#define cmovp(_0, _1) CMOVP(_0, _1)
+#define cmovo(_0, _1) CMOVO(_0, _1)
+#define cmovs(_0, _1) CMOVS(_0, _1)
+#define cmove(_0, _1) CMOVE(_0, _1)
+#define cmovz(_0, _1) CMOVZ(_0, _1)
+#define cmovg(_0, _1) CMOVG(_0, _1)
+#define cmovge(_0, _1) CMOVGE(_0, _1)
+#define cmovl(_0, _1) CMOVL(_0, _1)
+#define cmovle(_0, _1) CMOVLE(_0, _1)
+#define cmovna(_0, _1) CMOVNA(_0, _1)
+#define cmovnae(_0, _1) CMOVNAE(_0, _1)
+#define cmovnb(_0, _1) CMOVNB(_0, _1)
+#define cmovnbe(_0, _1) CMOVNBE(_0, _1)
+#define cmovnc(_0, _1) CMOVNC(_0, _1)
+#define cmovnp(_0, _1) CMOVNP(_0, _1)
+#define cmovno(_0, _1) CMOVNO(_0, _1)
+#define cmovns(_0, _1) CMOVNS(_0, _1)
+#define cmovne(_0, _1) CMOVNE(_0, _1)
+#define cmovnz(_0, _1) CMOVNZ(_0, _1)
+#define cmovng(_0, _1) CMOVNG(_0, _1)
+#define cmovnge(_0, _1) CMOVNGE(_0, _1)
+#define cmovnl(_0, _1) CMOVNL(_0, _1)
+#define cmovnle(_0, _1) CMOVNLE(_0, _1)
 
 // Vector moves
 
@@ -1038,6 +1094,28 @@
 #define v4fnmaddss(_0, _1, _2) V4FNMADDSS(_0, _1, _2)
 #define v4fnmaddps(_0, _1, _2) V4FNMADDPS(_0, _1, _2)
 
+// Conversions between single and double precision (ss/sd: scalar, ps/pd: packed)
+
+#define CVTSS2SD(_0, _1) INSTR_(cvtss2sd, _0, _1)
+#define CVTSD2SS(_0, _1) INSTR_(cvtsd2ss, _0, _1)
+#define CVTPS2PD(_0, _1) INSTR_(cvtps2pd, _0, _1)
+#define CVTPD2PS(_0, _1) INSTR_(cvtpd2ps, _0, _1)
+
+#define cvtss2sd(_0, _1) CVTSS2SD(_0, _1)
+#define cvtsd2ss(_0, _1) CVTSD2SS(_0, _1)
+#define cvtps2pd(_0, _1) CVTPS2PD(_0, _1)
+#define cvtpd2ps(_0, _1) CVTPD2PS(_0, _1)
+
+#define VCVTSS2SD(_0, _1) INSTR_(vcvtss2sd, _0, _1) // NOTE(review): the AVX scalar forms (vcvtss2sd/vcvtsd2ss) are documented with three operands -- confirm two-operand wrappers are intended
+#define VCVTSD2SS(_0, _1) INSTR_(vcvtsd2ss, _0, _1)
+#define VCVTPS2PD(_0, _1) INSTR_(vcvtps2pd, _0, _1)
+#define VCVTPD2PS(_0, _1) INSTR_(vcvtpd2ps, _0, _1)
+
+#define vcvtss2sd(_0, _1) VCVTSS2SD(_0, _1)
+#define vcvtsd2ss(_0, _1) VCVTSD2SS(_0, _1)
+#define vcvtps2pd(_0, _1) VCVTPS2PD(_0, _1)
+#define vcvtpd2ps(_0, _1) VCVTPD2PS(_0, _1)
+
 // Vector shuffles
 
 #define PSHUFD(_0, _1, _2) INSTR_(pshufd, _0, _1, _2)
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 1599d3fbd..4a0b977b3 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -126,6 +126,7 @@ extern "C" {
 #include "bli_setri.h"
 
 #include "bli_castm.h"
+#include "bli_castnzm.h"
 #include "bli_castv.h"
 #include "bli_projm.h"
 #include "bli_projv.h"
diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h
index 0f847bb6f..4cb41952a 100644
--- a/frame/include/level0/1m/bli_set1ms_mxn.h
+++ b/frame/include/level0/1m/bli_set1ms_mxn.h
@@ -37,6 +37,18 @@
 
 // set1ms_mxn
 
+#define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
+{ \
+	/* Intentionally empty. The real domain version is included only to */ \
+	/* facilitate macro-izing mixed-datatype components of packm. */ \
+}
+
+#define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
+{ \
+	/* Intentionally empty. The real domain version is included only to */ \
+	/* facilitate macro-izing mixed-datatype components of packm. */ \
+}
+
 #define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
 { \
 	inc_t offm_local = offm; \
diff --git a/frame/include/level0/bli_adds_mxn.h b/frame/include/level0/bli_adds_mxn.h
index 407380e8b..ab06fb362 100644
--- a/frame/include/level0/bli_adds_mxn.h
+++ b/frame/include/level0/bli_adds_mxn.h
@@ -41,62 +41,473 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of y.
 
-#define bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_ssadds( *(x + _i*rs_x + _j*cs_x), \
-	            *(y + _i*rs_y + _j*cs_y) ); \
+
+// xy = ?s
+
+static void bli_ssadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_ssadds() to every element pair: x is m x n float, y is m x n float.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ssadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_ssadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ssadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_dsadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_dsadds() to every element pair: x is m x n double, y is m x n float.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dsadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_dsadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dsadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_csadds() to every element pair: x is m x n scomplex, y is m x n float.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_csadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_csadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_csadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_zsadds() to every element pair: x is m x n dcomplex, y is m x n float.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zsadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_zsadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zsadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
 }
 
-#define bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_ddadds( *(x + _i*rs_x + _j*cs_x), \
-	            *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?d
+
+static void bli_sdadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_sdadds() to every element pair: x is m x n float, y is m x n double.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_sdadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_sdadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_sdadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_ddadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_ddadds() to every element pair: x is m x n double, y is m x n double.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ddadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_ddadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ddadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_cdadds() to every element pair: x is m x n scomplex, y is m x n double.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_cdadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_cdadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_cdadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_zdadds() to every element pair: x is m x n dcomplex, y is m x n double.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zdadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_zdadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zdadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
 }
 
-#define bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_ccadds( *(x + _i*rs_x + _j*cs_x), \
-	            *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?c
+
+static void bli_scadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_scadds() to every element pair: x is m x n float, y is m x n scomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_scadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_scadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_scadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_dcadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_dcadds() to every element pair: x is m x n double, y is m x n scomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dcadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_dcadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dcadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_ccadds() to every element pair: x is m x n scomplex, y is m x n scomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ccadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_ccadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_ccadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_zcadds() to every element pair: x is m x n dcomplex, y is m x n scomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zcadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_zcadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zcadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
 }
 
-#define bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_zzadds( *(x + _i*rs_x + _j*cs_x), \
-	            *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?z
+
+static void bli_szadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_szadds() to every element pair: x is m x n float, y is m x n dcomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_szadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_szadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_szadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_dzadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_dzadds() to every element pair: x is m x n double, y is m x n dcomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dzadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_dzadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_dzadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_czadds() to every element pair: x is m x n scomplex, y is m x n dcomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_czadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_czadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_czadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
+}
+static void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Applies bli_zzadds() to every element pair: x is m x n dcomplex, y is m x n dcomplex.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // unit row stride in both: inner loop runs down a column
+	{
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zzadds( x[ i + j*cs_x ],
+		            y[ i + j*cs_y ] );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // unit column stride in both: inner loop runs along a row
+	{
+		for ( dim_t i = 0; i < m; ++i )
+		for ( dim_t j = 0; j < n; ++j )
+		bli_zzadds( x[ i*rs_x + j ],
+		            y[ i*rs_y + j ] );
+	}
+	else
+#endif
+	{ // general strides
+		for ( dim_t j = 0; j < n; ++j )
+		for ( dim_t i = 0; i < m; ++i )
+		bli_zzadds( x[ i*rs_x + j*cs_x ],
+		            y[ i*rs_y + j*cs_y ] );
+	}
 }
 
 
-#define bli_sadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+
+static void bli_sadds_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                         float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (float/float) shorthand: forwards every argument unchanged to bli_ssadds_mxn().
+	bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_dadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_dadds_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                         double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (double/double) shorthand: forwards every argument unchanged to bli_ddadds_mxn().
+	bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_cadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                         scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (scomplex/scomplex) shorthand: forwards every argument unchanged to bli_ccadds_mxn().
+	bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_zadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                         dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (dcomplex/dcomplex) shorthand: forwards every argument unchanged to bli_zzadds_mxn().
+	bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
 
+
 #endif
diff --git a/frame/include/level0/bli_copyjnzs.h b/frame/include/level0/bli_copyjnzs.h
new file mode 100644
index 000000000..6bacdeb90
--- /dev/null
+++ b/frame/include/level0/bli_copyjnzs.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_COPYJNZS_H
+#define BLIS_COPYJNZS_H
+
+// copyjnzs
+
+// Notes:
+// - The first char encodes the type of x.
+// - The second char encodes the type of y.
+
+#define bli_sscopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
+#define bli_dscopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
+#define bli_cscopyjnzs( x, y )  bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
+#define bli_zscopyjnzs( x, y )  bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
+
+#define bli_sdcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_ddcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_cdcopyjnzs( x, y )  bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_zdcopyjnzs( x, y )  bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
+
+// NOTE: For real x (sc, dc), scopyjris() (implemented in terms of scopyris())
+// is used so that the imaginary part of y is not touched.
+#define bli_sccopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
+#define bli_dccopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
+#define bli_cccopyjnzs( x, y )  bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
+#define bli_zccopyjnzs( x, y )  bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
+
+// NOTE: For real x (sz, dz), dcopyjris() (implemented in terms of dcopyris())
+// is used so that the imaginary part of y is not touched.
+#define bli_szcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_dzcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_czcopyjnzs( x, y )  bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_zzcopyjnzs( x, y )  bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
+
+
+#define bli_iicopyjnzs( x, y )  { (y) = ( gint_t ) (x); }
+
+
+#define bli_scopyjnzs( x, y )  bli_sscopyjnzs( x, y )
+#define bli_dcopyjnzs( x, y )  bli_ddcopyjnzs( x, y )
+#define bli_ccopyjnzs( x, y )  bli_cccopyjnzs( x, y )
+#define bli_zcopyjnzs( x, y )  bli_zzcopyjnzs( x, y )
+#define bli_icopyjnzs( x, y )  bli_iicopyjnzs( x, y )
+
+
+#endif
+
diff --git a/frame/include/level0/bli_copynzs.h b/frame/include/level0/bli_copynzs.h
new file mode 100644
index 000000000..860b80e1e
--- /dev/null
+++ b/frame/include/level0/bli_copynzs.h
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_COPYNZS_H
+#define BLIS_COPYNZS_H
+
+// copynzs
+
+// Notes:
+// - The first char encodes the type of x.
+// - The second char encodes the type of y.
+
+#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
+#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
+#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
+#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
+
+#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
+#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
+
+// NOTE: For real x (sc, dc), scopyris() is used so that the imaginary part of y is not touched.
+#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
+#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
+#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
+#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
+
+// NOTE: For real x (sz, dz), dcopyris() is used so that the imaginary part of y is not touched.
+#define bli_szcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_dzcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_czcopynzs( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
+#define bli_zzcopynzs( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
+
+
+#define bli_iicopynzs( x, y )  { (y) = ( gint_t ) (x); }
+
+
+#define bli_scopynzs( x, y )  bli_sscopynzs( x, y )
+#define bli_dcopynzs( x, y )  bli_ddcopynzs( x, y )
+#define bli_ccopynzs( x, y )  bli_cccopynzs( x, y )
+#define bli_zcopynzs( x, y )  bli_zzcopynzs( x, y )
+#define bli_icopynzs( x, y )  bli_iicopynzs( x, y )
+
+
+#endif
+
diff --git a/frame/include/level0/bli_copys.h b/frame/include/level0/bli_copys.h
index c5d7d9a41..6dbd047ac 100644
--- a/frame/include/level0/bli_copys.h
+++ b/frame/include/level0/bli_copys.h
@@ -51,32 +51,18 @@
 #define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
 #define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
 
-#ifndef BLIS_ENABLE_C99_COMPLEX
-
+// NOTE: For real x (sc, dc), use of ccopyris() means the imaginary part of y will be overwritten with zero.
 #define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
 #define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
 #define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
 #define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
 
+// NOTE: For real x (sz, dz), use of zcopyris() means the imaginary part of y will be overwritten with zero.
 #define bli_szcopys( x, y )  bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
 #define bli_dzcopys( x, y )  bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
 #define bli_czcopys( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
 #define bli_zzcopys( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
 
-#else // ifdef BLIS_ENABLE_C99_COMPLEX
-
-#define bli_sccopys( x, y )  { (y) = (x); }
-#define bli_dccopys( x, y )  { (y) = (x); }
-#define bli_cccopys( x, y )  { (y) = (x); }
-#define bli_zccopys( x, y )  { (y) = (x); }
-
-#define bli_szcopys( x, y )  { (y) = (x); }
-#define bli_dzcopys( x, y )  { (y) = (x); }
-#define bli_czcopys( x, y )  { (y) = (x); }
-#define bli_zzcopys( x, y )  { (y) = (x); }
-
-#endif // BLIS_ENABLE_C99_COMPLEX
-
 
 #define bli_iicopys( x, y )  { (y) = ( gint_t ) (x); }
 
diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h
index 650ebc95b..dc85756b3 100644
--- a/frame/include/level0/bli_copys_mxn.h
+++ b/frame/include/level0/bli_copys_mxn.h
@@ -41,62 +41,470 @@
 // - The first char encodes the type of x.
 // - The second char encodes the type of y.
 
-#define bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
-	             *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?s
+
+static void bli_sscopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sscopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sscopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sscopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sscopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dscopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dscopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dscopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dscopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dscopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cscopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cscopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cscopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cscopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zscopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zscopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zscopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zscopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
-	             *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?d
+
+static void bli_sdcopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sdcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sdcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sdcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sdcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_ddcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_ddcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_ddcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_ddcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_ddcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cdcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cdcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cdcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cdcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zdcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zdcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zdcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zdcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
-	             *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?c
+
+static void bli_sccopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sccopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sccopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sccopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sccopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dccopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dccopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dccopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dccopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dccopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cccopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cccopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cccopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cccopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zccopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zccopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zccopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zccopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	dim_t _i, _j; \
-\
-	for ( _j = 0; _j < n; ++_j ) \
-	for ( _i = 0; _i < m; ++_i ) \
-	bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
-	             *(y + _i*rs_y + _j*cs_y) ); \
+// xy = ?z
+
+static void bli_szcopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_szcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_szcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_szcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_szcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dzcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dzcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dzcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dzcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dzcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_czcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_czcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_czcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_czcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                           dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zzcopys() on x[ii*rs_x + jj*cs_x], y[ii*rs_y + jj*cs_y] for all ii < m, jj < n.
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zzcopys( *(x + ii + jj*cs_x),
+		             *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zzcopys( *(x + ii*rs_x + jj),
+		             *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zzcopys( *(x + ii*rs_x + jj*cs_x),
+		             *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
 
-#define bli_scopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_scopys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Single-letter shorthand: forwards every argument unchanged to bli_sscopys_mxn().
+	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_dcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_dcopys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Single-letter shorthand: forwards every argument unchanged to bli_ddcopys_mxn().
+	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_ccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Single-letter shorthand: forwards every argument unchanged to bli_cccopys_mxn().
+	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
-#define bli_zcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \
-{ \
-	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \
+static void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Single-letter shorthand: forwards every argument unchanged to bli_zzcopys_mxn().
+	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
 }
 
 #endif
diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h
index 93e69a7c1..89e62f379 100644
--- a/frame/include/level0/bli_xpbys_mxn.h
+++ b/frame/include/level0/bli_xpbys_mxn.h
@@ -42,106 +42,605 @@
 // - The second char encodes the type of b.
 // - The third char encodes the type of y.
 
-#define bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_seq0( *beta ) ) \
-	{ \
-		bli_sscopys_mxn( m, n, \
-		                 x, rs_x, cs_x, \
-		                 y, rs_y, cs_y ); \
-	} \
-	else \
-	{ \
-		dim_t _i, _j; \
-\
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
-		              *(beta), \
-		              *(y + _i*rs_y + _j*cs_y) ); \
-	} \
+
+// xby = ?ss
+
+static void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            float*    restrict beta,
+                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sssxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_seq0( *beta ) )
+	{
+		bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_sssxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sssxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sssxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            float*    restrict beta,
+                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dssxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_seq0( *beta ) )
+	{
+		bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_dssxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dssxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dssxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            float*    restrict beta,
+                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cssxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_seq0( *beta ) )
+	{
+		bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_cssxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cssxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cssxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            float*    restrict beta,
+                                                            float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zssxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_seq0( *beta ) )
+	{
+		bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_zssxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zssxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zssxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_deq0( *beta ) ) \
-	{ \
-		bli_ddcopys_mxn( m, n, \
-		                 x, rs_x, cs_x, \
-		                 y, rs_y, cs_y ); \
-	} \
-	else \
-	{ \
-		dim_t _i, _j; \
-\
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
-		              *(beta), \
-		              *(y + _i*rs_y + _j*cs_y) ); \
-	} \
+// xby = ?dd
+
+static void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            double*   restrict beta,
+                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sddxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_deq0( *beta ) )
+	{
+		bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_sddxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sddxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sddxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            double*   restrict beta,
+                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dddxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_deq0( *beta ) )
+	{
+		bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_dddxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dddxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dddxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            double*   restrict beta,
+                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cddxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_deq0( *beta ) )
+	{
+		bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_cddxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cddxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cddxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            double*   restrict beta,
+                                                            double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_zddxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_deq0( *beta ) )
+	{
+		bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_zddxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zddxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zddxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_ceq0( *beta ) ) \
-	{ \
-		bli_cccopys_mxn( m, n, \
-		                 x, rs_x, cs_x, \
-		                 y, rs_y, cs_y ); \
-	} \
-	else \
-	{ \
-		dim_t _i, _j; \
-\
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
-		              *(beta), \
-		              *(y + _i*rs_y + _j*cs_y) ); \
-	} \
+// xby = ?cc
+
+static void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            scomplex* restrict beta,
+                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_sccxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_ceq0( *beta ) )
+	{
+		bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_sccxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sccxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_sccxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            scomplex* restrict beta,
+                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_dccxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_ceq0( *beta ) )
+	{
+		bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_dccxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dccxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dccxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            scomplex* restrict beta,
+                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Calls bli_cccxpbys( x_ij, *beta, y_ij ) for all ii < m, jj < n, unless *beta tests as zero.
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_ceq0( *beta ) )
+	{
+		bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return; // The copy already wrote y; skip the bli_cccxpbys() loops below.
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES
+	if ( rs_x == 1 && rs_y == 1 ) // rs_x == rs_y == 1: ii is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cccxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // cs_x == cs_y == 1: jj is the inner loop and indexes without a stride multiply.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_cccxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else
+#endif
+	{ // Arbitrary strides; this is the only path compiled if BLIS_ENABLE_CR_CASES is undefined.
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            scomplex* restrict beta,
+                                                            scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Elementwise bli_zccxpbys() (presumably y := x + beta*y) over m x n x (dcomplex) and y (scomplex).
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_ceq0( *beta ) )
+	{
+		bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return;
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES // Optional fast paths that drop one stride multiplication.
+	if ( rs_x == 1 && rs_y == 1 ) // Unit row strides: the inner ii loop walks adjacent elements.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zccxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // Unit column strides: loops swapped so jj is innermost.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zccxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else // General strides; the only path when BLIS_ENABLE_CR_CASES is undefined.
+#endif
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
-#define bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{ \
-	/* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \
-	if ( bli_zeq0( *beta ) ) \
-	{ \
-		bli_zzcopys_mxn( m, n, \
-		                 x, rs_x, cs_x, \
-		                 y, rs_y, cs_y ); \
-	} \
-	else \
-	{ \
-		dim_t _i, _j; \
-\
-		for ( _j = 0; _j < n; ++_j ) \
-		for ( _i = 0; _i < m; ++_i ) \
-		bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
-		              *(beta), \
-		              *(y + _i*rs_y + _j*cs_y) ); \
-	} \
+// xby = ?zz (x: any of s, d, c, z; beta and y: dcomplex)
+
+static void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            dcomplex* restrict beta,
+                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Elementwise bli_szzxpbys() (presumably y := x + beta*y) over m x n x (float) and y (dcomplex).
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_zeq0( *beta ) )
+	{
+		bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return;
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES // Optional fast paths that drop one stride multiplication.
+	if ( rs_x == 1 && rs_y == 1 ) // Unit row strides: the inner ii loop walks adjacent elements.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_szzxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // Unit column strides: loops swapped so jj is innermost.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_szzxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else // General strides; the only path when BLIS_ENABLE_CR_CASES is undefined.
+#endif
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            dcomplex* restrict beta,
+                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Elementwise bli_dzzxpbys() (presumably y := x + beta*y) over m x n x (double) and y (dcomplex).
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_zeq0( *beta ) )
+	{
+		bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return;
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES // Optional fast paths that drop one stride multiplication.
+	if ( rs_x == 1 && rs_y == 1 ) // Unit row strides: the inner ii loop walks adjacent elements.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dzzxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // Unit column strides: loops swapped so jj is innermost.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_dzzxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else // General strides; the only path when BLIS_ENABLE_CR_CASES is undefined.
+#endif
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            dcomplex* restrict beta,
+                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Elementwise bli_czzxpbys() (presumably y := x + beta*y) over m x n x (scomplex) and y (dcomplex).
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_zeq0( *beta ) )
+	{
+		bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return;
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES // Optional fast paths that drop one stride multiplication.
+	if ( rs_x == 1 && rs_y == 1 ) // Unit row strides: the inner ii loop walks adjacent elements.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_czzxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // Unit column strides: loops swapped so jj is innermost.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_czzxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else // General strides; the only path when BLIS_ENABLE_CR_CASES is undefined.
+#endif
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
+}
+static void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                            dcomplex* restrict beta,
+                                                            dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Elementwise bli_zzzxpbys() (presumably y := x + beta*y) over m x n x (dcomplex) and y (dcomplex).
+	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
+	if ( bli_zeq0( *beta ) )
+	{
+		bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
+		return;
+	}
+
+#ifdef BLIS_ENABLE_CR_CASES // Optional fast paths that drop one stride multiplication.
+	if ( rs_x == 1 && rs_y == 1 ) // Unit row strides: the inner ii loop walks adjacent elements.
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zzzxpbys( *(x + ii + jj*cs_x), *beta,
+		              *(y + ii + jj*cs_y) );
+	}
+	else if ( cs_x == 1 && cs_y == 1 ) // Unit column strides: loops swapped so jj is innermost.
+	{
+		for ( dim_t ii = 0; ii < m; ++ii )
+		for ( dim_t jj = 0; jj < n; ++jj )
+		bli_zzzxpbys( *(x + ii*rs_x + jj), *beta,
+		              *(y + ii*rs_y + jj) );
+	}
+	else // General strides; the only path when BLIS_ENABLE_CR_CASES is undefined.
+#endif
+	{
+		for ( dim_t jj = 0; jj < n; ++jj )
+		for ( dim_t ii = 0; ii < m; ++ii )
+		bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
+		              *(y + ii*rs_y + jj*cs_y) );
+	}
 }
 
 
-#define bli_sxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+
+static void bli_sxpbys_mxn( const dim_t m, const dim_t n, float*    restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          float*    restrict beta,
+                                                          float*    restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (float) short name: forwards every argument unchanged to bli_sssxpbys_mxn().
+	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-#define bli_dxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+static void bli_dxpbys_mxn( const dim_t m, const dim_t n, double*   restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          double*   restrict beta,
+                                                          double*   restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (double) short name: forwards every argument unchanged to bli_dddxpbys_mxn().
+	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-#define bli_cxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+static void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          scomplex* restrict beta,
+                                                          scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (scomplex) short name: forwards every argument unchanged to bli_cccxpbys_mxn().
+	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
-#define bli_zxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
-{\
-	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
+static void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
+                                                          dcomplex* restrict beta,
+                                                          dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
+{ // Same-type (dcomplex) short name: forwards every argument unchanged to bli_zzzxpbys_mxn().
+	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
 }
 
+
 #endif
diff --git a/frame/include/level0/bli_cast.h b/frame/include/level0/old/bli_cast.h
similarity index 100%
rename from frame/include/level0/bli_cast.h
rename to frame/include/level0/old/bli_cast.h
diff --git a/frame/include/level0/ri/bli_copyjris.h b/frame/include/level0/ri/bli_copyjris.h
index 6ca3ab432..910724bbd 100644
--- a/frame/include/level0/ri/bli_copyjris.h
+++ b/frame/include/level0/ri/bli_copyjris.h
@@ -42,5 +42,25 @@
 #define bli_ccopyjris( ar, ai, br, bi )  bli_ccopyris( (ar), -(ai), (br), (bi) )
 #define bli_zcopyjris( ar, ai, br, bi )  bli_zcopyris( (ar), -(ai), (br), (bi) )
 
+#define bli_sscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0F, br, bi ) // 1st letter: type of a; 2nd letter: single-type macro used
+#define bli_dscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0,  br, bi )
+#define bli_cscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )
+#define bli_zscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )
+// ?d variants forward to bli_dcopyjris(); when a is real (s, d) ai is ignored and a literal zero is passed.
+#define bli_sdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0F, br, bi )
+#define bli_ddcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0,  br, bi )
+#define bli_cdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )
+#define bli_zdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )
+// ?c variants forward to bli_ccopyjris(), which negates the imaginary argument before copying.
+#define bli_sccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0F, br, bi )
+#define bli_dccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0,  br, bi )
+#define bli_cccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )
+#define bli_zccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )
+// ?z variants forward to bli_zcopyjris(), which negates the imaginary argument before copying.
+#define bli_szcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0F, br, bi )
+#define bli_dzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0,  br, bi )
+#define bli_czcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )
+#define bli_zzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )
+
 #endif
 
diff --git a/frame/include/level0/ri/bli_copyris.h b/frame/include/level0/ri/bli_copyris.h
index b4eef9363..fa7d6b0a6 100644
--- a/frame/include/level0/ri/bli_copyris.h
+++ b/frame/include/level0/ri/bli_copyris.h
@@ -59,5 +59,24 @@
 	(bi) = (ai); \
 }
 
-#endif
+#define bli_sscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0F, br, bi ) // 1st letter: type of a; 2nd letter: single-type macro used
+#define bli_dscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0,  br, bi )
+#define bli_cscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
+#define bli_zscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
 
+#define bli_sdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0F, br, bi ) // real a (s, d): ai is ignored, a literal zero is passed
+#define bli_ddcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0,  br, bi )
+#define bli_cdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
+#define bli_zdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
+// ?c variants forward to bli_ccopyris().
+#define bli_sccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0F, br, bi )
+#define bli_dccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0,  br, bi )
+#define bli_cccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
+#define bli_zccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
+// ?z variants forward to bli_zcopyris().
+#define bli_szcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0F, br, bi )
+#define bli_dzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0,  br, bi )
+#define bli_czcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
+#define bli_zzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
+
+#endif
diff --git a/frame/ind/misc/bli_l3_ind_opt.h b/frame/ind/misc/bli_l3_ind_opt.h
index 862428ae3..9d59a6cb6 100644
--- a/frame/ind/misc/bli_l3_ind_opt.h
+++ b/frame/ind/misc/bli_l3_ind_opt.h
@@ -49,7 +49,8 @@
 \
 	/* If beta is in the real domain, and c is row- or column-stored,
 	   then we may proceed with the optimization. */ \
-	if ( bli_obj_imag_equals( &beta, &BLIS_ZERO ) && \
+	if ( /*bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&*/ \
+	     bli_obj_imag_is_zero( &beta ) && \
 	     !bli_is_gen_stored( rs_c, cs_c ) ) \
 	{ \
 		dt_exec = bli_dt_proj_to_real( dt_exec ); \
diff --git a/sandbox/ref99/blx_gemm_front.c b/sandbox/ref99/blx_gemm_front.c
index d1d56eee8..841cf3153 100644
--- a/sandbox/ref99/blx_gemm_front.c
+++ b/sandbox/ref99/blx_gemm_front.c
@@ -97,19 +97,6 @@ void blx_gemm_front
 		bli_obj_induce_trans( &c_local );
 	}
 
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_GEMM,
-	  BLIS_LEFT, // ignored for gemm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
 	{
 		// A sort of hack for communicating the desired pach schemas for A and
 		// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
@@ -131,6 +118,19 @@ void blx_gemm_front
 		}
 	}
 
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop, and then make any
+	// additional modifications necessary for the current operation.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm
+	  bli_obj_length( &c_local ),
+	  bli_obj_width( &c_local ),
+	  bli_obj_width( &a_local ),
+	  rntm
+	);
+
 	// Invoke the internal back-end via the thread handler.
 	blx_gemm_thread
 	(
diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c
index 2a1cbe6b6..c780489e9 100644
--- a/sandbox/ref99/vars/blx_gemm_ker_var2.c
+++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c
@@ -254,6 +254,9 @@ void PASTECH2(blx_,ch,varname) \
 	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
 	bli_auxinfo_set_is_a( is_a, &aux ); \
 	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* Save the desired output datatype (indicating no typecasting). */ \
+	bli_auxinfo_set_dt_on_output( dt, &aux ); \
 \
 	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
 	dim_t jr_num_threads = bli_thread_n_way( thread ); \
diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile
index 3c2a52124..3dcd6d435 100644
--- a/test/3m4m/Makefile
+++ b/test/3m4m/Makefile
@@ -200,9 +200,9 @@ STR_ST   := -DTHR_STR=\"st\"
 STR_MT   := -DTHR_STR=\"mt\"
 
 # Problem size specification
-PDEF_ST  := -DP_BEGIN=40 \
+PDEF_ST  := -DP_BEGIN=100 \
             -DP_END=2000 \
-            -DP_INC=40
+            -DP_INC=100
 
 PDEF_MT  := -DP_BEGIN=200 \
             -DP_END=10000 \
diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c
index 64311753c..50aeb47ee 100644
--- a/test/3m4m/test_gemm.c
+++ b/test/3m4m/test_gemm.c
@@ -35,9 +35,6 @@
 #include <unistd.h>
 #include "blis.h"
 
-void zgemm3m_( f77_char*, f77_char*, f77_int*, f77_int*, f77_int*, dcomplex*, dcomplex*, f77_int*, dcomplex*, f77_int*, dcomplex*, dcomplex*, f77_int* );
-
-
 //#define PRINT
 
 int main( int argc, char** argv )
@@ -148,9 +145,6 @@ int main( int argc, char** argv )
 		bli_obj_create( dt, m, k, 0, 0, &a );
 		bli_obj_create( dt, k, n, 0, 0, &b );
 		bli_obj_create( dt, m, n, 0, 0, &c );
-		//bli_obj_create( dt, m, k, 2, 2*m, &a );
-		//bli_obj_create( dt, k, n, 2, 2*k, &b );
-		//bli_obj_create( dt, m, n, 2, 2*m, &c );
 		bli_obj_create( dt, m, n, 0, 0, &c_save );
 
 		bli_randm( &a );
@@ -177,7 +171,6 @@ int main( int argc, char** argv )
 		{
 			bli_copym( &c_save, &c );
 
-
 			dtime = bli_clock();
 
 
diff --git a/test/mixeddt/Makefile b/test/mixeddt/Makefile
new file mode 100644
index 000000000..cb9c3484e
--- /dev/null
+++ b/test/mixeddt/Makefile
@@ -0,0 +1,401 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas at Austin nor the names
+#     of its contributors may be used to endorse or promote products
+#     derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# Makefile
+#
+# Field G. Van Zee
+#
+# Makefile for standalone BLIS test drivers.
+#
+
+#
+# --- Makefile PHONY target definitions ----------------------------------------
+#
+# NOTE: st, mt, and cleanout are real targets of this Makefile and so must be declared phony.
+.PHONY: all st mt all-st all-mt \
+        blis blis-st blis-mt \
+        blis-nat blis-nat-st blis-nat-mt \
+        openblas openblas-st openblas-mt \
+        mkl mkl-st mkl-mt \
+        blis-gemm-st blis-gemm-mt \
+        blis-gemm-nat-st blis-gemm-nat-mt \
+        openblas-gemm-st openblas-gemm-mt \
+        mkl-gemm-st mkl-gemm-mt \
+        clean cleanx cleanout
+
+
+
+#
+# --- Determine makefile fragment location -------------------------------------
+#
+
+# Comments:
+# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
+# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
+#   the second case because CONFIG_NAME is not yet set.
+ifneq ($(strip $(BLIS_INSTALL_PATH)),)
+LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
+INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
+SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
+else
+DIST_PATH  := ../..
+LIB_PATH    = ../../lib/$(CONFIG_NAME)
+INC_PATH    = ../../include/$(CONFIG_NAME)
+SHARE_PATH := ../..
+endif
+
+
+
+#
+# --- Include common makefile definitions --------------------------------------
+#
+
+# Include the common makefile fragment.
+-include $(SHARE_PATH)/common.mk
+
+
+
+#
+# --- BLAS and LAPACK implementations ------------------------------------------
+#
+
+# BLIS library and header path. This is simply wherever it was installed.
+#BLIS_LIB_PATH  := $(INSTALL_PREFIX)/lib
+#BLIS_INC_PATH  := $(INSTALL_PREFIX)/include/blis
+
+# BLIS library.
+#BLIS_LIB       := $(BLIS_LIB_PATH)/libblis.a
+
+# BLAS library path(s). This is where the BLAS libraries reside.
+HOME_LIB_PATH  := $(HOME)/flame/lib
+#MKL_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
+#MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
+MKL_LIB_PATH   := ${MKLROOT}/lib/intel64
+#ICC_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
+
+# OpenBLAS
+OPENBLAS_LIB   := $(HOME_LIB_PATH)/libopenblas.a
+OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
+
+# ATLAS
+ATLAS_LIB      := $(HOME_LIB_PATH)/libf77blas.a \
+                  $(HOME_LIB_PATH)/libatlas.a
+
+# MKL
+MKL_LIB        := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_sequential \
+                  -lpthread -lm -ldl
+#MKLP_LIB       := -L$(MKL_LIB_PATH) \
+#                  -lmkl_intel_thread \
+#                  -lmkl_core \
+#                  -lmkl_intel_ilp64 \
+#                  -L$(ICC_LIB_PATH) \
+#                  -liomp5
+MKLP_LIB       := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_gnu_thread \
+                  -lpthread -lm -ldl -fopenmp
+                  #-L$(ICC_LIB_PATH) \
+                  #-lgomp
+
+
+
+#
+# --- General build definitions ------------------------------------------------
+#
+
+TEST_SRC_PATH  := .
+TEST_OBJ_PATH  := .
+
+# Gather all local object files.
+TEST_OBJS      := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
+                                    $(TEST_OBJ_PATH)/%.o, \
+                                    $(wildcard $(TEST_SRC_PATH)/*.c)))
+
+# Override the value of CINCFLAGS so that the value of CFLAGS returned by
+# get-user-cflags-for() is not cluttered up with include paths needed only
+# while building BLIS.
+CINCFLAGS      := -I$(INC_PATH)
+
+# Use the CFLAGS returned by get-user-cflags-for() for the configuration family.
+CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
+
+# Add local header paths to CFLAGS.
+CFLAGS         += -I$(TEST_SRC_PATH)
+
+# Locate the libblis library to which we will link.
+LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
+
+
+# Datatypes for A, B, and C.
+#DTA_S     := -DDTA=BLIS_FLOAT
+#DTA_D     := -DDTA=BLIS_DOUBLE
+#DTA_C     := -DDTA=BLIS_SCOMPLEX
+#DTA_Z     := -DDTA=BLIS_DCOMPLEX
+#
+#DTB_S     := -DDTB=BLIS_FLOAT
+#DTB_D     := -DDTB=BLIS_DOUBLE
+#DTB_C     := -DDTB=BLIS_SCOMPLEX
+#DTB_Z     := -DDTB=BLIS_DCOMPLEX
+#
+#DTC_S     := -DDTC=BLIS_FLOAT
+#DTC_D     := -DDTC=BLIS_DOUBLE
+#DTC_C     := -DDTC=BLIS_SCOMPLEX
+#DTC_Z     := -DDTC=BLIS_DCOMPLEX
+#
+#DTX_S     := -DDTX=BLIS_FLOAT
+#DTX_D     := -DDTX=BLIS_DOUBLE
+
+# Which library?
+BLI_DEF  := -DBLIS
+BLA_DEF  := -DBLAS
+
+# Implementation string
+STR_BLI  := -DSTR=\"asm_blis\"
+STR_OBL  := -DSTR=\"openblas\"
+STR_MKL  := -DSTR=\"mkl\"
+
+# Single or multithreaded string
+STR_ST   := -DTHR_STR=\"st\"
+STR_MT   := -DTHR_STR=\"mt\"
+
+# Problem size specification
+PDEF_ST  := -DP_BEGIN=96 \
+            -DP_END=1200 \
+            -DP_INC=96
+
+PDEF_MT  := -DP_BEGIN=80 \
+            -DP_END=4000 \
+            -DP_INC=80
+
+# Enumerate possible datatypes and computation precisions.
+dts := s d c z
+prs := s d
+
+# Various functions that help us construct the datatype combinations and then
+# extract the needed datatype strings and C preprocessor define flags.
+get-char-c = $(word 1,$(subst _, ,$(1)))
+get-char-a = $(word 2,$(subst _, ,$(1)))
+get-char-b = $(word 3,$(subst _, ,$(1)))
+get-char-x = $(word 4,$(subst _, ,$(1)))
+get-cstr   = $(call get-char-c,$(1))$(call get-char-a,$(1))$(call get-char-b,$(1))$(call get-char-x,$(1))
+# NOTE: Nesting these substs is safe only because no replacement text contains a lowercase s, d, c, or z.
+get-cdef-a = $(strip $(subst s,-DDTA=BLIS_FLOAT, \
+                     $(subst d,-DDTA=BLIS_DOUBLE, \
+                     $(subst c,-DDTA=BLIS_SCOMPLEX, \
+                     $(subst z,-DDTA=BLIS_DCOMPLEX,$(call get-char-a,$(1)))))))
+get-cdef-b = $(strip $(subst s,-DDTB=BLIS_FLOAT, \
+                     $(subst d,-DDTB=BLIS_DOUBLE, \
+                     $(subst c,-DDTB=BLIS_SCOMPLEX, \
+                     $(subst z,-DDTB=BLIS_DCOMPLEX,$(call get-char-b,$(1)))))))
+get-cdef-c = $(strip $(subst s,-DDTC=BLIS_FLOAT, \
+                     $(subst d,-DDTC=BLIS_DOUBLE, \
+                     $(subst c,-DDTC=BLIS_SCOMPLEX, \
+                     $(subst z,-DDTC=BLIS_DCOMPLEX,$(call get-char-c,$(1)))))))
+get-cdef-x = $(strip $(subst s,-DDTX=BLIS_FLOAT, \
+                     $(subst d,-DDTX=BLIS_DOUBLE,$(call get-char-x,$(1)))))
+get-cdefs  = $(call get-cdef-c,$(1)) $(call get-cdef-a,$(1)) $(call get-cdef-b,$(1)) $(call get-cdef-x,$(1))
+
+# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags.
+get-idefs = $(strip $(subst asm_blis,-DSTR=\"$(1)\" -DBLIS, \
+                    $(subst openblas,-DSTR=\"$(1)\" -DBLAS, \
+                    $(subst      mkl,-DSTR=\"$(1)\" -DBLAS,$(1)))))
+
+# Enumerate all possible datatype combinations.
+DT_CODES := $(foreach dt0,$(dts),$(foreach dt1,$(dts),$(foreach dt2,$(dts),$(foreach pr,$(prs),$(dt0)_$(dt1)_$(dt2)_$(pr)))))
+
+# Build a list of the datatype strings.
+DT_COMBOS := $(foreach code,$(DT_CODES),$(call get-cstr,$(code)))
+
+# Build lists of BLIS and OpenBLAS object files and executables (MKL is disabled).
+BLIS_OBJS_ST     := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_st.o)
+BLIS_BINS_ST     := $(patsubst %.o,%.x,$(BLIS_OBJS_ST))
+OPENBLAS_OBJS_ST := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_st.o)
+OPENBLAS_BINS_ST := $(patsubst %.o,%.x,$(OPENBLAS_OBJS_ST))
+
+BLIS_OBJS_MT     := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_mt.o)
+BLIS_BINS_MT     := $(patsubst %.o,%.x,$(BLIS_OBJS_MT))
+OPENBLAS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_mt.o)
+OPENBLAS_BINS_MT := $(patsubst %.o,%.x,$(OPENBLAS_OBJS_MT))
+
+#MKL_OBJS_ST      := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_mkl_st.o)
+
+#BLIS_OBJS_MT     := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_mt.o)
+#OPENBLAS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_mt.o)
+#MKL_OBJS_MT      := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_mkl_mt.o)
+
+
+
+#
+# --- Targets/rules ------------------------------------------------------------
+#
+
+all:         st
+
+st:          blis-st openblas-st
+mt:          blis-mt openblas-mt
+
+blis-st:      $(BLIS_BINS_ST)
+openblas-st:  $(OPENBLAS_BINS_ST)
+blis-mt:      $(BLIS_BINS_MT)
+openblas-mt:  $(OPENBLAS_BINS_MT)
+#blis:        test_ssssgemm_asm_blis_st.x \
+#             test_sssdgemm_asm_blis_st.x \
+#             test_ssdsgemm_asm_blis_st.x \
+#             test_sdssgemm_asm_blis_st.x \
+#             test_dsssgemm_asm_blis_st.x \
+#             test_dddsgemm_asm_blis_st.x \
+#             test_ddddgemm_asm_blis_st.x
+#openblas:    test_ssssgemm_openblas_st.x \
+#             test_sssdgemm_openblas_st.x \
+#             test_ssdsgemm_openblas_st.x \
+#             test_sdssgemm_openblas_st.x \
+#             test_dsssgemm_openblas_st.x \
+#             test_dddsgemm_openblas_st.x \
+#             test_ddddgemm_openblas_st.x
+
+
+# -- Object file rules --
+
+# Define the function that will be used to instantiate compilation rules
+# for the various implementations.
+define make-st-rule
+test_$(call get-cstr,$(1))gemm_$(2)_st.o: test_gemm.c Makefile
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_ST) -c $$< -o $$@
+else
+	@echo "Compiling $$@"
+	@$(CC) $(CFLAGS) $(PDEF_ST) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_ST) -c $$< -o $$@
+endif
+endef
+
+define make-mt-rule
+test_$(call get-cstr,$(1))gemm_$(2)_mt.o: test_gemm.c Makefile
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(CC) $(CFLAGS) $(PDEF_MT) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_MT) -c $$< -o $$@
+else
+	@echo "Compiling $$@"
+	@$(CC) $(CFLAGS) $(PDEF_MT) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_MT) -c $$< -o $$@
+endif
+endef
+
+
+# Define the implementations for which we will instantiate compilation rules.
+IMPLS := asm_blis openblas
+
+# Instantiate the rule functions make-st-rule() and make-mt-rule() for each
+# implementation in IMPLS and each of the datatype "codes" in DT_CODES.
+$(foreach impl,$(IMPLS), \
+$(foreach code,$(DT_CODES),$(eval $(call make-st-rule,$(code),$(impl)))))
+
+$(foreach impl,$(IMPLS), \
+$(foreach code,$(DT_CODES),$(eval $(call make-mt-rule,$(code),$(impl)))))
+
+
+# -- Executable file rules --
+
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+
+test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(LINKER)  $<  $(OPENBLAS_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	$(RM_F) $<
+else
+	@@echo "Linking $@ to '$(notdir $(OPENBLAS_LIB)) $(LIBBLIS_LINK)'"
+	@$(LINKER) $<  $(OPENBLAS_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	@$(RM_F) $<
+endif
+# Multithreaded OpenBLAS drivers link against $(OPENBLASP_LIB); report that same library when linking.
+test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(LINKER)  $<  $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	$(RM_F) $<
+else
+	@@echo "Linking $@ to '$(notdir $(OPENBLASP_LIB)) $(LIBBLIS_LINK)'"
+	@$(LINKER) $<  $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	@$(RM_F) $<
+endif
+
+#test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK)
+#ifeq ($(ENABLE_VERBOSE),yes)
+#	$(LINKER)  $<       $(MKL_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+#	$(RM_F) $<
+#else
+#	@@echo "Linking $@ to '$(notdir $(MKL_LIB)) $(LIBBLIS_LINK)'"
+#	@$(LINKER) $<       $(MKL_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+#	@$(RM_F) $<
+#endif
+
+#test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK)
+#	$(LINKER) $<  $(MKLP_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+
+test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(LINKER) $<                   $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	$(RM_F) $<
+else
+	@@echo "Linking $@ to '$(LIBBLIS_LINK)'"
+	@$(LINKER) $<                   $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	@$(RM_F) $<
+endif
+
+test_%_blis_mt.x: test_%_blis_mt.o $(LIBBLIS_LINK)
+ifeq ($(ENABLE_VERBOSE),yes)
+	$(LINKER) $<                   $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	$(RM_F) $<
+else
+	@@echo "Linking $@ to '$(LIBBLIS_LINK)'"
+	@$(LINKER) $<                   $(LIBBLIS_LINK) $(LDFLAGS) -o $@
+	@$(RM_F) $<
+endif
+
+
+# -- Clean rules --
+
+clean: cleanx
+
+cleanx:
+	- $(RM_F) *.o *.x
+
+cleanout:
+	- $(RM_F) *.m
+
+
diff --git a/test/mixeddt/matlab/gemm_md.pdf b/test/mixeddt/matlab/gemm_md.pdf
new file mode 100644
index 000000000..e665aef46
Binary files /dev/null and b/test/mixeddt/matlab/gemm_md.pdf differ
diff --git a/test/mixeddt/matlab/gen_dt_combos.m b/test/mixeddt/matlab/gen_dt_combos.m
new file mode 100644
index 000000000..ee0fe8389
--- /dev/null
+++ b/test/mixeddt/matlab/gen_dt_combos.m
@@ -0,0 +1,37 @@
+function r_val = gen_dt_combos()
+% GEN_DT_COMBOS  Return the 128 mixed-datatype gemm combination strings.
+%
+%   r_val is a 128-by-4 char matrix. Each row is [ dtc dta dtb pr ]: the
+%   datatype chars of C, A and B ('s', 'd', 'c' or 'z') followed by the
+%   computation precision char ('s' or 'd').
+%
+%   The rows are deliberately NOT in plain nested-loop order. Every run of
+%   16 consecutive rows shares one C datatype and one domain of A (real
+%   {s,d} first, then complex {c,z}). Within a run, rows 1-8 use precision
+%   's' and rows 9-16 use 'd'; inside each half A varies slower than B.
+
+c_chars  = 'sdcz';          % datatype of C (slowest-varying)
+a_groups = [ 'sd'; 'cz' ];  % datatypes of A, one row per domain
+b_chars  = 'sdcz';          % datatype of B (fastest-varying)
+pr_chars = 'sd';            % computation precision
+
+n_rows = numel( c_chars ) * numel( a_groups ) * numel( b_chars ) * numel( pr_chars );
+combos = repmat( ' ', n_rows, 4 );
+
+row = 0;
+for ch_c = c_chars
+	for grp = 1:size( a_groups, 1 )
+		for ch_p = pr_chars
+			for ch_a = a_groups( grp, : )
+				for ch_b = b_chars
+					row = row + 1;
+					combos( row, : ) = [ ch_c ch_a ch_b ch_p ];
+				end
+			end
+		end
+	end
+end
+
+r_val = combos;
+
+end
diff --git a/test/mixeddt/matlab/plot_all_md.m b/test/mixeddt/matlab/plot_all_md.m
new file mode 100644
index 000000000..9302bdb0a
--- /dev/null
+++ b/test/mixeddt/matlab/plot_all_md.m
@@ -0,0 +1,84 @@
+function r_val = plot_all_md( is_mt )
+% PLOT_ALL_MD  Plot all mixed-datatype gemm results and print gemm_md.pdf.
+%
+%   is_mt  1 selects the multithreaded ('mt') data files; any other value
+%          selects the single-threaded ('st') ones.
+%   r_val  Always 0.
+%
+%   For every combination from gen_dt_combos(), the data scripts
+%   ../output_<thr>_<combo>gemm_asm_blis.m and ..._openblas.m are run. They
+%   are expected to define data_gemm_asm_blis and data_gemm_openblas, which
+%   are passed to plot_gemm_perf() along with matching reference data.
+
+if is_mt == 1
+	thr_str = 'mt';
+else
+	thr_str = 'st';
+end
+
+dt_combos = gen_dt_combos();
+n_combos  = size( dt_combos, 1 );
+
+filetemp_blis = '../output_%s_%sgemm_asm_blis.m';
+filetemp_open = '../output_%s_%sgemm_openblas.m';
+
+% Load the four reference data sets once: single real, double real,
+% single complex, double complex. Running a data script (re)defines
+% data_gemm_asm_blis / data_gemm_openblas in this workspace, so each result
+% is copied out before the next script is run.
+ref_combos = { 'ssss', 'dddd', 'cccs', 'zzzd' };
+ref_blis   = cell( 1, numel( ref_combos ) );
+ref_open   = cell( 1, numel( ref_combos ) );
+for ri = 1:numel( ref_combos )
+	run( sprintf( filetemp_blis, thr_str, ref_combos{ ri } ) )
+	run( sprintf( filetemp_open, thr_str, ref_combos{ ri } ) )
+	ref_blis{ ri } = data_gemm_asm_blis( :, : );
+	ref_open{ ri } = data_gemm_openblas( :, : );
+end
+
+fig = figure;
+orient( fig, 'landscape' );
+set(gcf,'Position',[0 0 2000 900]);
+set(gcf,'PaperUnits', 'inches');
+set(gcf,'PaperSize', [64 33]);
+set(gcf,'PaperPosition', [0 0 64 33]);
+set(gcf,'PaperPositionMode','manual');
+set(gcf,'PaperOrientation','landscape');
+
+for dti = 1:n_combos
+
+	% Grab the current datatype combination.
+	combo = dt_combos( dti, : );
+
+	str = sprintf( 'Plotting %d: %s', dti, combo ); disp(str);
+
+	% Choose the reference by computation precision (combo(4)); use the
+	% complex reference only when C, A and B are all complex.
+	if combo(4) == 's'
+		ref_idx = 1;
+	else
+		ref_idx = 2;
+	end
+	if all( ismember( combo( 1:3 ), 'cz' ) )
+		ref_idx = ref_idx + 2;
+	end
+
+	% Load the data files for this combination.
+	run( sprintf( filetemp_blis, thr_str, combo ) )
+	run( sprintf( filetemp_open, thr_str, combo ) )
+
+	% Plot the result.
+	plot_gemm_perf( combo, ...
+	                data_gemm_asm_blis, ...
+	                ref_blis{ ref_idx }, ...
+	                data_gemm_openblas, ...
+	                ref_open{ ref_idx }, ...
+	                is_mt, dti );
+
+end
+
+print(gcf, 'gemm_md','-bestfit','-dpdf');
+
+r_val = 0;
+
+end
diff --git a/test/mixeddt/matlab/plot_gemm_perf.m b/test/mixeddt/matlab/plot_gemm_perf.m
new file mode 100644
index 000000000..7fc9b0752
--- /dev/null
+++ b/test/mixeddt/matlab/plot_gemm_perf.m
@@ -0,0 +1,125 @@
+function r_val = plot_gemm_perf( dt_str, ...
+                                 data_blis, ...
+                                 data_blis_ref, ...
+                                 data_open, ...
+                                 data_open_ref, ...
+                                 is_mt, ...
+                                 theid )
+% PLOT_GEMM_PERF  Draw one panel of the 8-by-16 mixed-datatype gemm figure.
+%
+%   dt_str         Datatype combination string; it forms the panel title and
+%                  dt_str(4) ('s' or otherwise) selects the y-axis limit.
+%   data_blis      Mixed-datatype BLIS data; column 1 is the x axis and
+%                  column 4 is plotted as a solid blue line.
+%   data_blis_ref  BLIS reference data; column 4, dotted blue line.
+%   data_open      Mixed-datatype data for the second library (legend
+%                  'OBLA'); column 4, solid black line.
+%   data_open_ref  Reference data for the second library; dotted black line.
+%   is_mt          1 for multithreaded data (values divided by nth = 4 and
+%                  x limit 4000); otherwise nth = 1 and x limit 2000.
+%   theid          Subplot index into the 8-by-16 grid. Panels 1 and 9 get a
+%                  legend, panels above 112 an x label, and the first panel
+%                  of each row a y label.
+%   r_val          Always 0.
+
+ax1 = subplot( 8, 16, theid );
+hold( ax1, 'on' );
+
+color_blis = 'b';
+color_open = 'k';
+style_ref  = ':';
+style_md   = '-';
+
+% Flops per cycle used for the y-axis upper limit (hard-coded; presumably
+% specific to the machine the data was collected on).
+if dt_str(4) == 's'
+	flopspercycle = 32;
+else
+	flopspercycle = 16;
+end
+
+if is_mt == 1
+	yaxisname = 'GFLOPS/core';
+	nth       = 4;
+	x_end     = 4000;
+else
+	yaxisname = 'GFLOPS';
+	nth       = 1;
+	x_end     = 2000;
+end
+
+% 3.6 is presumably the clock rate in GHz -- confirm before reusing.
+max_perf_core = (flopspercycle * 3.6) * 1;
+
+titlename = sprintf( '%sgemm', dt_str );
+xaxisname = '     m = n = k';
+
+x_begin = 0;
+y_begin = 0;
+y_end   = max_perf_core;
+
+flopscol = 4;
+fontsize = 12;
+linesize = 0.7;
+
+x_axis = data_blis( :, 1 );
+
+blis_ref = line( x_axis, data_blis_ref( :, flopscol ) / nth, ...
+                 'Color',color_blis, 'LineStyle',style_ref, ...
+                 'LineWidth',linesize );
+blis_md  = line( x_axis, data_blis( :, flopscol ) / nth, ...
+                 'Color',color_blis, 'LineStyle',style_md, ...
+                 'LineWidth',linesize );
+open_ref = line( x_axis, data_open_ref( :, flopscol ) / nth, ...
+                 'Color',color_open, 'LineStyle',style_ref, ...
+                 'LineWidth',linesize );
+open_md  = line( x_axis, data_open( :, flopscol ) / nth, ...
+                 'Color',color_open, 'LineStyle',style_md, ...
+                 'LineWidth',linesize );
+
+xlim( ax1, [x_begin x_end] );
+ylim( ax1, [y_begin y_end] );
+
+% Only panels 1 and 9 carry a legend; they differ solely in the labels
+% given to the reference curves ([sc]gemm vs. [dz]gemm).
+if theid == 1 || theid == 9
+	if theid == 1
+		blis_ref_legend = 'BLIS [sc]gemm';
+		open_ref_legend = 'OBLA [sc]gemm';
+	else
+		blis_ref_legend = 'BLIS [dz]gemm';
+		open_ref_legend = 'OBLA [dz]gemm';
+	end
+	leg = legend( [ blis_ref blis_md open_ref open_md ], ...
+	              blis_ref_legend, 'BLIS mixed', ...
+	              open_ref_legend, 'OBLA mixed', ...
+	              'Location', 'best' );
+	set( leg,'Box','off' );
+	set( leg,'Color','none' );
+	set( leg,'FontSize',fontsize-2 );
+	set( leg,'Units','inches' );
+end
+
+set( ax1,'FontSize',fontsize );
+set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
+box( ax1, 'on' );
+
+titl = title( titlename );
+set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
+
+% Shift the title right by 40 to re-center it over the panel's box.
+tpos = get( titl, 'Position' );
+tpos(1) = tpos(1) + 40;
+set( titl, 'Position', tpos );
+
+if theid > 112
+	xlabel( ax1,xaxisname );
+end
+
+if mod(theid-1,16) == 0
+	ylabel( ax1,yaxisname );
+end
+
+r_val = 0;
+
+end
diff --git a/test/mixeddt/matlab/testrand.m b/test/mixeddt/matlab/testrand.m
new file mode 100644
index 000000000..07474711f
--- /dev/null
+++ b/test/mixeddt/matlab/testrand.m
@@ -0,0 +1,44 @@
+fig1 = figure(1); % scratch figure for experimenting with PDF page-size settings
+clf;
+
+%orient(fig1,'landscape')
+orient(gcf,'landscape')
+% Fill an 8-by-16 grid (128 panels) with random curves as stand-in data.
+for i = 1:128
+    subplot(8,16,i);
+    xx = 400:400:2000;
+    aa = rand(size(xx));
+    plot(xx,aa);
+end
+
+% Variant 1 (disabled): normalized paper position; marked broken by the author.
+if 0
+set(gcf, 'PaperUnits', 'inches');
+set(gcf, 'PaperSize', [60 36]);
+set(fig1,'PaperUnits','normalized');
+set(fig1,'PaperPosition', [0 0 1 1]);
+print(fig1, 'testrand', '-dpdf');
+end
+
+if 0
+% Variant 2 (disabled): automatic paper position mode; noted as working okay.
+set(gcf,'PaperUnits', 'inches');
+set(gcf,'PaperSize', [72 36]);
+set(gcf,'PaperPositionMode','auto');         
+set(gcf,'PaperOrientation','landscape');
+set(gcf,'Position',[50 50 4000 1800]);
+print(gcf, 'testrand','-bestfit','-dpdf');
+end
+
+if 1
+% Variant 3 (enabled): manual 48x22 inch paper position; noted as possibly better.
+set(gcf,'Position',[0 0 2000 900]);
+set(gcf,'PaperUnits', 'inches');
+set(gcf,'PaperSize', [48 22]);
+set(gcf,'PaperPosition', [0 0 48 22]);
+%set(gcf,'PaperPositionMode','auto');         
+set(gcf,'PaperPositionMode','manual');         
+set(gcf,'PaperOrientation','landscape');
+print(gcf, 'testrand','-bestfit','-dpdf');
+end
+
diff --git a/test/mixeddt/matlab/testrand.pdf b/test/mixeddt/matlab/testrand.pdf
new file mode 100644
index 000000000..b97c17528
Binary files /dev/null and b/test/mixeddt/matlab/testrand.pdf differ
diff --git a/test/mixeddt/runme.sh b/test/mixeddt/runme.sh
new file mode 100755
index 000000000..2e9967f2b
--- /dev/null
+++ b/test/mixeddt/runme.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# Runs each mixed-datatype gemm test driver and redirects its output to a .m file.
+# File prefixes.
+exec_root="test"
+out_root="output"
+
+sys="blis"
+#sys="stampede2"
+#sys="lonestar5"
+
+# Bind threads to processors.
+#export OMP_PROC_BIND=true
+#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7"
+#export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7"
+#export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7"
+#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45"
+#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
+export GOMP_CPU_AFFINITY="0 1 2 3"
+
+# Modify LD_LIBRARY_PATH.
+if [ "${sys}" = "blis" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH"
+
+elif [ "${sys}" = "stampede2" ]; then
+
+	:
+
+elif [ "${sys}" = "lonestar5" ]; then
+
+	# A hack to use libiomp5 with gcc.
+	#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
+	:
+
+fi
+
+# Threading scheme to use when multithreading
+if [ "${sys}" = "blis" ]; then
+
+	jc_nt=2 # 5th loop
+	ic_nt=2 # 3rd loop
+	jr_nt=1 # 2nd loop
+	ir_nt=1 # 1st loop
+	nt=4
+
+elif [ "${sys}" = "stampede2" ]; then
+
+	jc_nt=2 # 5th loop
+	ic_nt=8 # 3rd loop
+	jr_nt=1 # 2nd loop
+	ir_nt=1 # 1st loop
+	nt=16
+
+elif [ "${sys}" = "lonestar5" ]; then
+
+	jc_nt=4 # 5th loop
+	ic_nt=6 # 3rd loop
+	jr_nt=1 # 2nd loop
+	ir_nt=1 # 1st loop
+	nt=24
+
+fi
+
+# Implementations to test (per system).
+if [ "${sys}" = "blis" ]; then
+
+	test_impls="openblas asm_blis"
+
+elif [ "${sys}" = "stampede2" ]; then
+
+	test_impls="openblas asm_blis mkl"
+
+elif [ "${sys}" = "lonestar5" ]; then
+
+	test_impls="openblas mkl asm_blis"
+fi
+
+# Datatypes to test.
+#dts="s d c z"
+
+# Operations to test.
+l3_ops="gemm"
+test_ops="${l3_ops}"
+
+# Define the list of datatype chars and precision chars.
+dt_chars="s d c z"
+pr_chars="s d"
+
+# Construct the datatype combination strings.
+dt_combos=""
+for dtc in ${dt_chars}; do
+	for dta in ${dt_chars}; do
+		for dtb in ${dt_chars}; do
+			for pre in ${pr_chars}; do
+				dt_combos="${dt_combos} ${dtc}${dta}${dtb}${pre}"
+			done
+		done
+	done
+done
+
+# Threadedness to test.
+threads="mt"
+#threads="st"
+
+test_impls="openblas"  # NOTE: unconditionally overrides the per-system list chosen above.
+
+#dt_combos="ssss sssd ssds sdss dsss ddds dddd"
+#dt_combos="csss csds cdss cdds zsss zsds zdss zdds cssd csdd cdsd cddd zssd zsdd zdsd zddd"
+#dt_combos="cssd csdd cdsd cddd zsss zsds zdss zdds"
+#dt_combos="cdsd cddd zsss zsds zdss zdds"
+#test_impls="asm_blis"
+
+# Now run the mixed-datatype test cases.
+for th in ${threads}; do
+
+	for dt in ${dt_combos}; do
+
+		for im in ${test_impls}; do
+
+			for op in ${test_ops}; do
+
+				# Set the number of threads according to th.
+				if [ "${th}" = "mt" ]; then
+
+					export BLIS_JC_NT=${jc_nt}
+					export BLIS_IC_NT=${ic_nt}
+					export BLIS_JR_NT=${jr_nt}
+					export BLIS_IR_NT=${ir_nt}
+					export OMP_NUM_THREADS=${nt}
+					export OPENBLAS_NUM_THREADS=${nt}
+
+					# Unset GOMP_CPU_AFFINITY for OpenBLAS, as it causes the library
+					# to execute sequentially.
+					if [ "${im}" = "openblas" ]; then
+						unset GOMP_CPU_AFFINITY
+					else
+						export GOMP_CPU_AFFINITY="0 1 2 3"
+					fi
+				else
+
+					export BLIS_JC_NT=1
+					export BLIS_IC_NT=1
+					export BLIS_JR_NT=1
+					export BLIS_IR_NT=1
+					export OMP_NUM_THREADS=1
+					export OPENBLAS_NUM_THREADS=1
+				fi
+
+				# Construct the name of the test executable.
+				exec_name="${exec_root}_${dt}${op}_${im}_${th}.x"
+
+				# Construct the name of the output file.
+				out_file="${out_root}_${th}_${dt}${op}_${im}.m"
+
+				echo "Running (nt = ${OMP_NUM_THREADS}) ./${exec_name} > ${out_file}"
+
+				# Run executable.
+				"./${exec_name}" > "${out_file}"
+
+				#sleep 1
+
+			done
+		done
+	done
+done
diff --git a/test/mixeddt/test_gemm.c b/test/mixeddt/test_gemm.c
new file mode 100644
index 000000000..7be31960d
--- /dev/null
+++ b/test/mixeddt/test_gemm.c
@@ -0,0 +1,580 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+void blas_gemm_md( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c );
+void blas_gemm( trans_t transa, trans_t transb, num_t dt, obj_t* ao, obj_t* alpha, obj_t* bo, obj_t* beta, obj_t* co );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	// Benchmark driver: for each problem size p in [P_BEGIN,P_END] (step P_INC),
+	// time gemm n_repeats times and print one matlab-style row [ m k n gflops ].
+	// DTA/DTB/DTC/DTX, P_* and STR are expected to be defined at compile time.
+	obj_t    a, b, c;
+	obj_t    c_save;
+	obj_t*   alphao;
+	obj_t*   betao;
+	dim_t    m, n, k;
+	dim_t    p;
+	dim_t    p_begin, p_end, p_inc;
+	int      m_input, n_input, k_input;
+	num_t    dta, dtb, dtc, dtx;
+	char     dta_ch, dtb_ch, dtc_ch;
+	char     dtx_ch;
+	int      r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+
+	double   dtime;
+	double   dtime_save;
+	double   gflops;
+	double   flopsmul = 0.0; // overwritten by the domain chain below
+
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;
+
+	dta       = DTA;
+	dtb       = DTB;
+	dtc       = DTC;
+	dtx       = DTX;
+
+	// Extract the precision component of the computation datatype.
+	prec_t comp_prec = bli_dt_prec( dtx );
+
+	p_begin   = P_BEGIN;
+	p_end     = P_END;
+	p_inc     = P_INC;
+
+	m_input   = -1;
+	n_input   = -1;
+	k_input   = -1;
+
+#if 0
+	k_input = 256;
+#endif
+
+	// Choose the char corresponding to the requested datatype.
+	if      ( bli_is_float( dta ) )    dta_ch = 's';
+	else if ( bli_is_double( dta ) )   dta_ch = 'd';
+	else if ( bli_is_scomplex( dta ) ) dta_ch = 'c';
+	else                               dta_ch = 'z';
+
+	if      ( bli_is_float( dtb ) )    dtb_ch = 's';
+	else if ( bli_is_double( dtb ) )   dtb_ch = 'd';
+	else if ( bli_is_scomplex( dtb ) ) dtb_ch = 'c';
+	else                               dtb_ch = 'z';
+
+	if      ( bli_is_float( dtc ) )    dtc_ch = 's';
+	else if ( bli_is_double( dtc ) )   dtc_ch = 'd';
+	else if ( bli_is_scomplex( dtc ) ) dtc_ch = 'c';
+	else                               dtc_ch = 'z';
+
+	if      ( bli_is_float( dtx ) )    dtx_ch = 's';
+	else                               dtx_ch = 'd';
+	// Only the commented-out printf()s use these chars; mark them as used.
+	( void )dta_ch;
+	( void )dtb_ch;
+	( void )dtc_ch;
+	( void )dtx_ch;
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+	// Begin with initializing the last entry to zero so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
+
+	//printf( "data_%s_%c%c%c%cgemm_%s",      THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR );
+	printf( "data_gemm_%s", STR );
+	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+	        ( unsigned long )(p - p_begin)/p_inc + 1, // 1-based row index
+	        ( unsigned long )0,
+	        ( unsigned long )0,
+	        ( unsigned long )0, 0.0 );
+
+	// Adjust the flops scaling based on which domain case is being executed.
+	if      (    bli_is_real( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	flopsmul = 2.0;
+	else if (    bli_is_real( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+	flopsmul = 2.0;
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	flopsmul = 2.0;
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+#ifdef BLIS
+	flopsmul = 4.0;
+#else
+	flopsmul = 4.0;  // executes 8.0, but only gets "credit" for 4.0
+#endif
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	flopsmul = 2.0;
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+#ifdef BLIS
+	flopsmul = 4.0;
+#else
+	flopsmul = 4.0;  // executes 8.0, but only gets "credit" for 4.0
+#endif
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	flopsmul = 4.0;
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+	flopsmul = 8.0;
+
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dta, m, k, 0, 0, &a );
+		bli_obj_create( dtb, k, n, 0, 0, &b );
+		bli_obj_create( dtc, m, n, 0, 0, &c );
+		bli_obj_create( dtc, m, n, 0, 0, &c_save );
+
+		bli_obj_set_comp_prec( comp_prec, &c );
+
+		alphao = &BLIS_ONE;
+		betao  = &BLIS_ONE;
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_gemm
+			(
+			  alphao,
+			  &a,
+			  &b,
+			  betao,
+			  &c
+			);
+
+#else
+			blas_gemm_md
+			(
+			  alphao,
+			  &a,
+			  &b,
+			  betao,
+			  &c
+			);
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( flopsmul * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		//printf( "data_%s_%c%c%c%cgemm_%s",      THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR );
+		printf( "data_gemm_%s", STR );
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1, // 1-based row index
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
+
+void blas_gemm_md( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c )
+{
+	trans_t transa = bli_obj_conjtrans_status( a );
+	trans_t transb = bli_obj_conjtrans_status( b );
+
+	prec_t  comp_prec = bli_obj_comp_prec( c );
+
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( c ) == ( num_t )comp_prec )
+	{
+		blas_gemm( transa, transb, bli_obj_dt( c ), alpha, a, b, beta, c );
+		return;
+	}
+
+	num_t   dtc = bli_obj_dt( c );
+	num_t   dta = bli_obj_dt( a );
+	num_t   dtb = bli_obj_dt( b );
+
+	dim_t   m = bli_obj_length( c );
+	dim_t   n = bli_obj_width( c );
+	dim_t   k = bli_obj_width_after_trans( a );
+
+	obj_t*  ao = a;
+	obj_t*  bo = b;
+	obj_t*  co = c;
+
+	num_t   targ_dt_c, targ_dt_a, targ_dt_b;
+	dom_t   targ_dom_c, targ_dom_a, targ_dom_b;
+	num_t   dt_comp;
+	dom_t   comp_dom;
+	obj_t   at, bt, ct;
+	obj_t   ar,     cr;
+	bool_t  needacc;
+	bool_t  force_proj_a = FALSE;
+	bool_t  force_proj_b = FALSE;
+
+	
+
+	if      (    bli_is_real( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	{
+		// rrr
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL;
+		needacc = FALSE;
+	}
+	else if (    bli_is_real( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+	{
+		// rrc
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL;
+		needacc = FALSE;
+		force_proj_b = TRUE;
+	}
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	{
+		// rcr
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL;
+		needacc = FALSE;
+		force_proj_a = TRUE;
+	}
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+	{
+		// rcc
+		comp_dom = BLIS_COMPLEX;
+		targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX;
+		needacc = TRUE;
+	}
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	{
+		// crr
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL;
+		needacc = TRUE;
+	}
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+	{
+		// crc
+		comp_dom = BLIS_COMPLEX;
+		targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX;
+		needacc = FALSE;
+		force_proj_a = TRUE;
+	}
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	{
+		// ccr
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_REAL;
+		needacc = FALSE;
+	}
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+	{
+		// ccc
+		comp_dom = BLIS_COMPLEX;
+		targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX;
+		needacc = FALSE;
+	}
+	else
+	{
+		comp_dom = BLIS_REAL;
+		targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL;
+		needacc = FALSE;
+	}
+
+	// ----------------------------------------------------------------------------
+
+
+	// Merge the computation domain with the computation precision.
+	dt_comp = comp_dom | comp_prec;
+
+	targ_dt_a = targ_dom_a | comp_prec;
+	targ_dt_b = targ_dom_b | comp_prec;
+	targ_dt_c = targ_dom_c | comp_prec;
+
+	// Copy-cast A, if needed.
+	if ( bli_dt_prec( dta ) != comp_prec || force_proj_a )
+	{
+		bli_obj_create( targ_dt_a, m, k, 0, 0, &at );
+		bli_castm( ao, &at );
+		ao = &at;
+	}
+
+	// Copy-cast B, if needed.
+	if ( bli_dt_prec( dtb ) != comp_prec || force_proj_b )
+	{
+		bli_obj_create( targ_dt_b, k, n, 0, 0, &bt );
+		bli_castm( bo, &bt );
+		bo = &bt;
+	}
+
+	if ( bli_dt_prec( dtc ) != comp_prec )
+	{
+		needacc = TRUE;
+	}
+
+	// Copy-cast C, if needed.
+	if ( needacc )
+	{
+		//bli_obj_create( dt_comp, m, n, 0, 0, &ct );
+		bli_obj_create( targ_dt_c, m, n, 0, 0, &ct );
+		bli_castm( c, &ct );
+		co = &ct;
+	}
+
+	// ----------------------------------------------------------------------------
+
+	if      (    bli_is_real( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	{
+	}
+	else if (    bli_is_real( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+	{
+	}
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	{
+	}
+	else if (    bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+	{
+	}
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) &&    bli_is_real( dtb ) )
+	{
+	}
+	else if ( bli_is_complex( dtc ) &&    bli_is_real( dta ) && bli_is_complex( dtb ) )
+	{
+	}
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) &&    bli_is_real( dtb ) )
+	{
+		inc_t rsa = bli_obj_row_stride( ao );
+		inc_t csa = bli_obj_col_stride( ao );
+		inc_t ma  = bli_obj_length( ao );
+		inc_t na  = bli_obj_width( ao );
+		siz_t ela = bli_obj_elem_size( ao );
+		num_t dtap = bli_obj_dt_proj_to_real( ao );
+
+		bli_obj_alias_to( ao, &ar ); ao = &ar;
+		bli_obj_set_strides( rsa, 2*csa, ao );
+		bli_obj_set_dims( 2*ma, na, ao );
+		bli_obj_set_dt( dtap, ao );
+		bli_obj_set_elem_size( ela/2, ao );
+
+		inc_t rsc = bli_obj_row_stride( co );
+		inc_t csc = bli_obj_col_stride( co );
+		inc_t mc  = bli_obj_length( co );
+		inc_t nc  = bli_obj_width( co );
+		siz_t elc = bli_obj_elem_size( co );
+		num_t dtcp = bli_obj_dt_proj_to_real( co );
+
+		bli_obj_alias_to( co, &cr ); co = &cr;
+		bli_obj_set_strides( rsc, 2*csc, co );
+		bli_obj_set_dims( 2*mc, nc, co );
+		bli_obj_set_dt( dtcp, co );
+		bli_obj_set_elem_size( elc/2, co );
+	}
+	else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) )
+	{
+	}
+	else
+	{
+	}
+
+	// ----------------------------------------------------------------------------
+
+
+	// Call the BLAS.
+	blas_gemm( transa, transb, dt_comp, alpha, ao, bo, beta, co );
+
+	// Accumulate back to C, if needed.
+	if ( needacc )
+	{
+		bli_castm( &ct, c );
+	}
+
+
+	if ( bli_dt_prec( dta ) != comp_prec || force_proj_a ) { bli_obj_free( &at ); }
+	if ( bli_dt_prec( dtb ) != comp_prec || force_proj_b ) { bli_obj_free( &bt ); }
+	if ( needacc )                                         { bli_obj_free( &ct ); }
+}
+
+void blas_gemm( trans_t transa, trans_t transb, num_t dt, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c )
+{
+	char f77_transa = 'N';
+	char f77_transb = 'N';
+	// NOTE: transa/transb are never read; with the mapping calls below disabled, 'N' is always passed.
+	//bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	//bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+	// Dispatch on dt to sgemm_/dgemm_/cgemm_/zgemm_; any other dt matches no branch and nothing is called.
+	if ( bli_is_float( dt ) )
+	{
+		f77_int  mm     = bli_obj_length( c );
+		f77_int  kk     = bli_obj_width_after_trans( a );
+		f77_int  nn     = bli_obj_width( c );
+		f77_int  lda    = bli_obj_col_stride( a );
+		f77_int  ldb    = bli_obj_col_stride( b );
+		f77_int  ldc    = bli_obj_col_stride( c );
+		float*   alphap = bli_obj_buffer_for_1x1( dt, alpha );
+		float*   ap     = bli_obj_buffer( a );
+		float*   bp     = bli_obj_buffer( b );
+		float*   betap  = bli_obj_buffer_for_1x1( dt, beta );
+		float*   cp     = bli_obj_buffer( c );
+
+		sgemm_( &f77_transa,
+		        &f77_transb,
+		        &mm,
+		        &nn,
+		        &kk,
+		        alphap,
+		        ap, &lda,
+		        bp, &ldb,
+		        betap,
+		        cp, &ldc );
+	}
+	else if ( bli_is_double( dt ) )
+	{
+		f77_int  mm     = bli_obj_length( c );
+		f77_int  kk     = bli_obj_width_after_trans( a );
+		f77_int  nn     = bli_obj_width( c );
+		f77_int  lda    = bli_obj_col_stride( a );
+		f77_int  ldb    = bli_obj_col_stride( b );
+		f77_int  ldc    = bli_obj_col_stride( c );
+		double*  alphap = bli_obj_buffer_for_1x1( dt, alpha );
+		double*  ap     = bli_obj_buffer( a );
+		double*  bp     = bli_obj_buffer( b );
+		double*  betap  = bli_obj_buffer_for_1x1( dt, beta );
+		double*  cp     = bli_obj_buffer( c );
+
+		dgemm_( &f77_transa,
+		        &f77_transb,
+		        &mm,
+		        &nn,
+		        &kk,
+		        alphap,
+		        ap, &lda,
+		        bp, &ldb,
+		        betap,
+		        cp, &ldc );
+	}
+	else if ( bli_is_scomplex( dt ) )
+	{
+		f77_int    mm     = bli_obj_length( c );
+		f77_int    kk     = bli_obj_width_after_trans( a );
+		f77_int    nn     = bli_obj_width( c );
+		f77_int    lda    = bli_obj_col_stride( a );
+		f77_int    ldb    = bli_obj_col_stride( b );
+		f77_int    ldc    = bli_obj_col_stride( c );
+		scomplex*  alphap = bli_obj_buffer_for_1x1( dt, alpha );
+		scomplex*  ap     = bli_obj_buffer( a );
+		scomplex*  bp     = bli_obj_buffer( b );
+		scomplex*  betap  = bli_obj_buffer_for_1x1( dt, beta );
+		scomplex*  cp     = bli_obj_buffer( c );
+
+		cgemm_( &f77_transa,
+		        &f77_transb,
+		        &mm,
+		        &nn,
+		        &kk,
+		        alphap,
+		        ap, &lda,
+		        bp, &ldb,
+		        betap,
+		        cp, &ldc );
+	}
+	else if ( bli_is_dcomplex( dt ) )
+	{
+		f77_int    mm     = bli_obj_length( c );
+		f77_int    kk     = bli_obj_width_after_trans( a );
+		f77_int    nn     = bli_obj_width( c );
+		f77_int    lda    = bli_obj_col_stride( a );
+		f77_int    ldb    = bli_obj_col_stride( b );
+		f77_int    ldc    = bli_obj_col_stride( c );
+		dcomplex*  alphap = bli_obj_buffer_for_1x1( dt, alpha );
+		dcomplex*  ap     = bli_obj_buffer( a );
+		dcomplex*  bp     = bli_obj_buffer( b );
+		dcomplex*  betap  = bli_obj_buffer_for_1x1( dt, beta );
+		dcomplex*  cp     = bli_obj_buffer( c );
+
+		zgemm_( &f77_transa,
+		        &f77_transb,
+		        &mm,
+		        &nn,
+		        &kk,
+		        alphap,
+		        ap, &lda,
+		        bp, &ldb,
+		        betap,
+		        cp, &ldc );
+	}
+}
+
diff --git a/testsuite/input.general b/testsuite/input.general
index 601941d83..772840224 100644
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -25,6 +25,8 @@ cj      # Vector storage scheme(s) to test:
 sdcz    # Datatype(s) to test:
         #   's' = single real; 'c' = single complex;
         #   'd' = double real; 'z' = double complex
+0       # Test gemm with mixed-domain operands?
+0       # Test gemm with mixed-precision operands?
 100     # Problem size: first to test
 500     # Problem size: maximum to test
 100     # Problem size: increment between experiments
diff --git a/testsuite/input.general.fast b/testsuite/input.general.fast
index f9de7d099..02b30b897 100644
--- a/testsuite/input.general.fast
+++ b/testsuite/input.general.fast
@@ -25,6 +25,8 @@ cj      # Vector storage scheme(s) to test:
 sdcz    # Datatype(s) to test:
         #   's' = single real; 'c' = single complex;
         #   'd' = double real; 'z' = double complex
+0       # Test gemm with mixed-domain operands?
+0       # Test gemm with mixed-precision operands?
 100     # Problem size: first to test
 100     # Problem size: maximum to test
 100     # Problem size: increment between experiments
diff --git a/testsuite/input.operations b/testsuite/input.operations
index c3e6d6f16..f35e2cd9b 100644
--- a/testsuite/input.operations
+++ b/testsuite/input.operations
@@ -190,6 +190,10 @@
 -1 -2    #   dimensions: m n
 ?        #   parameters: transa
 
+1        # xpbym
+-1 -1    #   dimensions: m n
+?        #   parameters: transa
+
 
 # --- Level-1f kernels -----------------------------------------------------
 
diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c
index bc261d7ae..a6b37864e 100644
--- a/testsuite/src/test_addm.c
+++ b/testsuite/src/test_addm.c
@@ -59,7 +59,7 @@ void libblis_test_addm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -139,7 +139,7 @@ void libblis_test_addm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,6 +150,8 @@ void libblis_test_addm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transx;
@@ -158,6 +160,9 @@ void libblis_test_addm_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c
index eeaa38dd4..bb9a6c60d 100644
--- a/testsuite/src/test_addv.c
+++ b/testsuite/src/test_addv.c
@@ -59,7 +59,7 @@ void libblis_test_addv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -138,7 +138,7 @@ void libblis_test_addv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,6 +149,8 @@ void libblis_test_addv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -157,6 +159,9 @@ void libblis_test_addv_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c
index 0aa5aaeb5..aa2865fad 100644
--- a/testsuite/src/test_amaxv.c
+++ b/testsuite/src/test_amaxv.c
@@ -59,7 +59,7 @@ void libblis_test_amaxv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -142,7 +142,7 @@ void libblis_test_amaxv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -156,12 +156,17 @@ void libblis_test_amaxv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	obj_t        x;
 	obj_t        index;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c
index 6a26d81d4..37c155eea 100644
--- a/testsuite/src/test_axpbyv.c
+++ b/testsuite/src/test_axpbyv.c
@@ -59,7 +59,7 @@ void libblis_test_axpbyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,7 +149,7 @@ void libblis_test_axpbyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -163,6 +163,8 @@ void libblis_test_axpbyv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -171,6 +173,9 @@ void libblis_test_axpbyv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c
index 5a8dc32ae..1e3a610e8 100644
--- a/testsuite/src/test_axpy2v.c
+++ b/testsuite/src/test_axpy2v.c
@@ -59,7 +59,7 @@ void libblis_test_axpy2v_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,7 +149,7 @@ void libblis_test_axpy2v_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -163,6 +163,8 @@ void libblis_test_axpy2v_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx, conjy;
@@ -172,9 +174,13 @@ void libblis_test_axpy2v_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c
index e37f8d4f7..cac2760d3 100644
--- a/testsuite/src/test_axpyf.c
+++ b/testsuite/src/test_axpyf.c
@@ -59,7 +59,7 @@ void libblis_test_axpyf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -147,7 +147,7 @@ void libblis_test_axpyf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -161,6 +161,8 @@ void libblis_test_axpyf_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, b_n;
 
 	conj_t       conja, conjx;
@@ -170,9 +172,13 @@ void libblis_test_axpyf_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c
index e8c15a8e9..240bd4251 100644
--- a/testsuite/src/test_axpym.c
+++ b/testsuite/src/test_axpym.c
@@ -59,7 +59,7 @@ void libblis_test_axpym_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -144,7 +144,7 @@ void libblis_test_axpym_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -158,6 +158,8 @@ void libblis_test_axpym_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transx;
@@ -166,6 +168,9 @@ void libblis_test_axpym_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c
index 1a048c7fa..60a1e3ece 100644
--- a/testsuite/src/test_axpyv.c
+++ b/testsuite/src/test_axpyv.c
@@ -59,7 +59,7 @@ void libblis_test_axpyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -144,7 +144,7 @@ void libblis_test_axpyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -158,6 +158,8 @@ void libblis_test_axpyv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -166,6 +168,9 @@ void libblis_test_axpyv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c
index 2ea6ad7e8..9deebe92a 100644
--- a/testsuite/src/test_copym.c
+++ b/testsuite/src/test_copym.c
@@ -59,7 +59,7 @@ void libblis_test_copym_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -138,7 +138,7 @@ void libblis_test_copym_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,6 +149,8 @@ void libblis_test_copym_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transx;
@@ -156,6 +158,9 @@ void libblis_test_copym_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c
index 62892a793..8b3c3b7d4 100644
--- a/testsuite/src/test_copyv.c
+++ b/testsuite/src/test_copyv.c
@@ -59,7 +59,7 @@ void libblis_test_copyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -138,7 +138,7 @@ void libblis_test_copyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,6 +149,8 @@ void libblis_test_copyv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -156,6 +158,9 @@ void libblis_test_copyv_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c
index ca416f62f..7cb6000af 100644
--- a/testsuite/src/test_dotaxpyv.c
+++ b/testsuite/src/test_dotaxpyv.c
@@ -59,7 +59,7 @@ void libblis_test_dotaxpyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -151,7 +151,7 @@ void libblis_test_dotaxpyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -165,6 +165,8 @@ void libblis_test_dotaxpyv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjxt, conjx, conjy;
@@ -175,9 +177,13 @@ void libblis_test_dotaxpyv_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c
index 3d4fe840c..6cf0d229a 100644
--- a/testsuite/src/test_dotv.c
+++ b/testsuite/src/test_dotv.c
@@ -59,7 +59,7 @@ void libblis_test_dotv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -140,7 +140,7 @@ void libblis_test_dotv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,6 +154,8 @@ void libblis_test_dotv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx, conjy, conjconjxy;
@@ -161,6 +163,9 @@ void libblis_test_dotv_experiment
 	obj_t        x, y, rho;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c
index 3eb0363bf..f59497f28 100644
--- a/testsuite/src/test_dotxaxpyf.c
+++ b/testsuite/src/test_dotxaxpyf.c
@@ -59,7 +59,7 @@ void libblis_test_dotxaxpyf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -157,7 +157,7 @@ void libblis_test_dotxaxpyf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -171,6 +171,8 @@ void libblis_test_dotxaxpyf_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, b_n;
 
 	conj_t       conjat, conja, conjw, conjx;
@@ -180,9 +182,13 @@ void libblis_test_dotxaxpyf_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c
index db92b37d7..12f44c260 100644
--- a/testsuite/src/test_dotxf.c
+++ b/testsuite/src/test_dotxf.c
@@ -59,7 +59,7 @@ void libblis_test_dotxf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,7 +149,7 @@ void libblis_test_dotxf_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -163,6 +163,8 @@ void libblis_test_dotxf_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, b_n;
 
 	conj_t       conjat, conjx;
@@ -172,9 +174,13 @@ void libblis_test_dotxf_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c
index 5033c7504..774706414 100644
--- a/testsuite/src/test_dotxv.c
+++ b/testsuite/src/test_dotxv.c
@@ -59,7 +59,7 @@ void libblis_test_dotxv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -145,7 +145,7 @@ void libblis_test_dotxv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -159,6 +159,8 @@ void libblis_test_dotxv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx, conjy, conjconjxy;
@@ -167,6 +169,9 @@ void libblis_test_dotxv_experiment
 	obj_t        rho_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c
index f3869d814..71ab97449 100644
--- a/testsuite/src/test_gemm.c
+++ b/testsuite/src/test_gemm.c
@@ -59,7 +59,20 @@ void libblis_test_gemm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     );
+
+void libblis_test_gemm_md
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -89,6 +102,24 @@ void libblis_test_gemm_check
        double*        resid
      );
 
+void libblis_test_gemm_md_check
+     (
+       test_params_t* params,
+       obj_t*         alpha,
+       obj_t*         a,
+       obj_t*         b,
+       obj_t*         beta,
+       obj_t*         c,
+       obj_t*         c_orig,
+       double*        resid
+     );
+
+double libblis_test_gemm_flops
+     (
+       obj_t* a,
+       obj_t* b,
+       obj_t* c
+     );
 
 
 void libblis_test_gemm_deps
@@ -151,7 +182,7 @@ void libblis_test_gemm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -165,6 +196,8 @@ void libblis_test_gemm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n, k;
 
 	trans_t      transa;
@@ -174,6 +207,18 @@ void libblis_test_gemm_experiment
 	obj_t        c_save;
 
 
+	// Use a different function to handle mixed datatypes.
+	if ( params->mixed_domain || params->mixed_precision )
+	{
+		libblis_test_gemm_md( params, op, iface,
+		                      dc_str, pc_str, sc_str,
+		                      p_cur, perf, resid );
+		return;
+	}
+
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
@@ -189,7 +234,7 @@ void libblis_test_gemm_experiment
 
 	// Create test operands (vectors and/or matrices).
 	libblis_test_mobj_create( params, datatype, transa,
-	                          sc_str[0], m, k, &a );
+	                          sc_str[1], m, k, &a );
 	libblis_test_mobj_create( params, datatype, transb,
 	                          sc_str[1], k, n, &b );
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
@@ -249,6 +294,134 @@ void libblis_test_gemm_experiment
 }
 
 
+void libblis_test_gemm_md
+     (
+       test_params_t* params,  // test parameters (n_repeats, mixed_precision)
+       test_op_t*     op,      // op struct; dim_spec[0..2] map to m, n, k
+       iface_t        iface,   // interface passed to libblis_test_gemm_impl()
+       char*          dc_str,  // datatype chars: C, A, B [, computation]
+       char*          pc_str,  // parameter chars: transa, transb
+       char*          sc_str,  // storage chars: A, B, C
+       unsigned int   p_cur,   // current problem size
+       double*        perf,    // out: performance of the best repeat
+       double*        resid    // out: residual from the correctness check
+     )
+{
+	unsigned int n_repeats = params->n_repeats;
+	unsigned int i;
+
+	double       time_min  = DBL_MAX;
+	double       time;
+
+	num_t        dt_a, dt_b, dt_c;
+	num_t        dt_complex;
+
+	dim_t        m, n, k;
+
+	trans_t      transa;
+	trans_t      transb;
+
+	obj_t        alpha, a, b, beta, c;
+	obj_t        c_save;
+
+	// Mixed-datatype gemm experiment: time n_repeats runs, then check C.
+	// Decode the datatype combination string (ordered as C, A, B).
+	bli_param_map_char_to_blis_dt( dc_str[0], &dt_c );
+	bli_param_map_char_to_blis_dt( dc_str[1], &dt_a );
+	bli_param_map_char_to_blis_dt( dc_str[2], &dt_b );
+
+	// Project the datatype of C to the complex domain. The result is the
+	// datatype with which the alpha and beta scalars are created below.
+	dt_complex = bli_dt_proj_to_complex( dt_c );
+
+	// Map the dimension specifier to actual dimensions.
+	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
+	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
+	k = libblis_test_get_dim_from_prob_size( op->dim_spec[2], p_cur );
+
+	// Map parameter characters to BLIS constants.
+	bli_param_map_char_to_blis_trans( pc_str[0], &transa );
+	bli_param_map_char_to_blis_trans( pc_str[1], &transb );
+
+	// Create test scalars.
+	bli_obj_scalar_init_detached( dt_complex, &alpha );
+	bli_obj_scalar_init_detached( dt_complex, &beta );
+
+	// Create the matrix operands A, B, and C, plus a backup copy of C.
+	libblis_test_mobj_create( params, dt_a, transa,
+	                          sc_str[0], m, k, &a );
+	libblis_test_mobj_create( params, dt_b, transb,
+	                          sc_str[1], k, n, &b );
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
+	                          sc_str[2], m, n, &c );
+	libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE,
+	                          sc_str[2], m, n, &c_save );
+
+	// For mixed-precision, set the computation precision of C.
+	if ( params->mixed_precision )
+	{
+		num_t dt_comp;
+		prec_t comp_prec;
+
+		// The computation precision is encoded in the computation datatype,
+		// which appears as an additional char in dc_str.
+		bli_param_map_char_to_blis_dt( dc_str[3], &dt_comp );
+
+		// Extract the precision from the computation datatype.
+		comp_prec = bli_dt_prec( dt_comp );
+
+		// Set the computation precision of C.
+		bli_obj_set_comp_prec( comp_prec, &c );
+	}
+
+
+	// Set alpha and beta.
+	{
+		bli_setsc(  2.0,  0.0, &alpha );
+		bli_setsc(  1.2,  0.5, &beta );
+	}
+
+	// Randomize A, B, and C, and save C.
+	libblis_test_mobj_randomize( params, TRUE, &a );
+	libblis_test_mobj_randomize( params, TRUE, &b );
+	libblis_test_mobj_randomize( params, TRUE, &c );
+	bli_copym( &c, &c_save );
+
+	// Apply the transa/transb parameters to A and B.
+	bli_obj_set_conjtrans( transa, &a );
+	bli_obj_set_conjtrans( transb, &b );
+
+	// Repeat the experiment n_repeats times and record results.
+	for ( i = 0; i < n_repeats; ++i )
+	{
+		bli_copym( &c_save, &c );
+
+		time = bli_clock();
+
+		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
+
+		time_min = bli_clock_min_diff( time_min, time );
+	}
+
+	// Estimate the performance of the best experiment repeat.
+	// The flop count depends on the domains of A, B, and C, so it is
+	// obtained from libblis_test_gemm_flops() instead of a fixed formula.
+	*perf = libblis_test_gemm_flops( &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF;
+
+	// Perform checks.
+	libblis_test_gemm_md_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid );
+
+	// Zero out performance and residual if output matrix is empty.
+	libblis_test_check_empty_problem( &c, perf, resid );
+
+	// Free the test objects.
+	bli_obj_free( &a );
+	bli_obj_free( &b );
+	bli_obj_free( &c );
+	bli_obj_free( &c_save );
+}
+
+
 
 void libblis_test_gemm_impl
      (
@@ -273,6 +446,116 @@ void libblis_test_gemm_impl
 
 
 
+void libblis_test_gemm_md_check
+     (
+       test_params_t* params,  // test parameters, used to randomize t
+       obj_t*         alpha,   // scalar applied to a * b
+       obj_t*         a,       // left-hand matrix operand
+       obj_t*         b,       // right-hand matrix operand
+       obj_t*         beta,    // scalar applied to c_orig
+       obj_t*         c,       // result computed by the gemm under test
+       obj_t*         c_orig,  // contents of c before the operation
+       double*        resid    // out: norm of the checksum difference
+     )
+{
+	num_t  dt_real = bli_obj_dt_proj_to_real( c );
+	num_t  dt_comp = bli_obj_dt_proj_to_complex( c );
+	num_t  dt;
+
+	dim_t  m       = bli_obj_length( c );
+	dim_t  n       = bli_obj_width( c );
+	dim_t  k       = bli_obj_width_after_trans( a );
+
+	obj_t  norm;
+	obj_t  t, v, w, z;
+
+	double junk; // second output of bli_getsc(); unused
+
+	// Compute our reference checksum in the real domain if all operands
+	// are real, and in the complex domain otherwise. Also implicit in this
+	// is that we use the storage precision of C to determine the precision
+	// in which we perform the reference checksum.
+	if ( bli_obj_is_real( a ) &&
+	     bli_obj_is_real( b ) &&
+	     bli_obj_is_real( c ) ) dt = dt_real;
+	else                        dt = dt_comp;
+
+	// As in libblis_test_gemm_check(), we verify c against a checksum:
+	// with a random real vector t, c * t must match
+	// alpha * a * (b * t) + beta * c_orig * t. Here, though, a, b, c_orig,
+	// and c are first cast into temporaries of datatype dt.
+
+	obj_t a2, b2, c2, c0;
+
+	bli_obj_scalar_init_detached( dt_real, &norm );
+
+	bli_obj_create( dt, n, 1, 0, 0, &t );
+	bli_obj_create( dt, m, 1, 0, 0, &v );
+	bli_obj_create( dt, k, 1, 0, 0, &w );
+	bli_obj_create( dt, m, 1, 0, 0, &z );
+
+	libblis_test_vobj_randomize( params, TRUE, &t );
+
+	// We need to zero out the imaginary part of t in order for our
+	// checks to work in all cases. Otherwise, the imaginary parts
+	// could affect intermediate products, depending on the order that
+	// they are executed.
+	bli_setiv( &BLIS_ZERO, &t );
+
+	// Create temporaries of datatype dt to hold a, b, c_orig, and c.
+	bli_obj_create( dt, m, k, 0, 0, &a2 );
+	bli_obj_create( dt, k, n, 0, 0, &b2 );
+	bli_obj_create( dt, m, n, 0, 0, &c2 );
+	bli_obj_create( dt, m, n, 0, 0, &c0 );
+
+	// Cast a, b, c_orig, and c into the datatype of our temporary objects.
+	bli_castm( a,      &a2 );
+	bli_castm( b,      &b2 );
+	bli_castm( c_orig, &c2 );
+	bli_castm( c,      &c0 );
+
+	bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v ); // v = c * t
+
+#if 0
+if ( bli_obj_is_scomplex( c ) &&
+     bli_obj_is_float( a ) &&
+     bli_obj_is_float( b ) )
+{
+bli_printm( "test_gemm.c: a", a, "%7.3f", "" );
+bli_printm( "test_gemm.c: b", b, "%7.3f", "" );
+bli_printm( "test_gemm.c: c orig", c_orig, "%7.3f", "" );
+bli_printm( "test_gemm.c: c computed", c, "%7.3f", "" );
+}
+#endif
+
+#if 0
+	bli_gemm( alpha, &a2, &b2, beta, &c2 );
+	bli_gemv( &BLIS_ONE, &c2, &t, &BLIS_ZERO, &z );
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
+#else
+	bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w ); // w = b * t
+	bli_gemv( alpha, &a2, &w, &BLIS_ZERO, &z ); // z = alpha * a * w
+	bli_gemv( beta, &c2, &t, &BLIS_ONE, &z ); // z += beta * c_orig * t
+	if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z );
+#endif
+
+	bli_subv( &z, &v ); // v = v - z
+	bli_normfv( &v, &norm );
+	bli_getsc( &norm, resid, &junk );
+
+	bli_obj_free( &t );
+	bli_obj_free( &v );
+	bli_obj_free( &w );
+	bli_obj_free( &z );
+
+	bli_obj_free( &a2 );
+	bli_obj_free( &b2 );
+	bli_obj_free( &c2 );
+	bli_obj_free( &c0 );
+}
+
+
+
 void libblis_test_gemm_check
      (
        test_params_t* params,
@@ -348,3 +631,43 @@ void libblis_test_gemm_check
 	bli_obj_free( &z );
 }
 
+double libblis_test_gemm_flops
+     (
+       obj_t* a,  // left-hand operand; its trans setting is honored for k
+       obj_t* b,  // right-hand operand
+       obj_t* c   // output operand; supplies m and n
+     )
+{
+	bool_t a_is_real    = bli_obj_is_real( a );
+	bool_t a_is_complex = bli_obj_is_complex( a );
+
+	bool_t b_is_real    = bli_obj_is_real( b );
+	bool_t b_is_complex = bli_obj_is_complex( b );
+
+	bool_t c_is_real    = bli_obj_is_real( c );
+	bool_t c_is_complex = bli_obj_is_complex( c );
+	// Query k after a's trans is applied, as libblis_test_gemm_md_check() does.
+	double m            = ( double )bli_obj_length( c );
+	double n            = ( double )bli_obj_width( c );
+	double k            = ( double )bli_obj_width_after_trans( a );
+
+	double flops;
+
+	if      ( ( c_is_complex && a_is_complex && b_is_complex ) )
+	{
+		flops = 8.0 * m * n * k; // all three operands complex
+	}
+	else if ( ( c_is_complex && a_is_complex && b_is_real    ) ||
+	          ( c_is_complex && a_is_real    && b_is_complex ) ||
+	          ( c_is_real    && a_is_complex && b_is_complex ) )
+	{
+		flops = 4.0 * m * n * k; // exactly two operands complex
+	}
+	else
+	{
+		flops = 2.0 * m * n * k; // remaining cases (at most one complex)
+	}
+
+	return flops;
+}
+
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 4fa0a2f27..f8fcb1224 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -59,7 +59,7 @@ void libblis_test_gemm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -152,7 +152,7 @@ void libblis_test_gemm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -166,6 +166,8 @@ void libblis_test_gemm_ukr_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n, k;
 	inc_t        ldap, ldbp;
 
@@ -178,9 +180,13 @@ void libblis_test_gemm_ukr_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 6fead8c82..351991bb9 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -59,7 +59,7 @@ void libblis_test_gemmtrsm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -171,7 +171,7 @@ void libblis_test_gemmtrsm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -185,6 +185,8 @@ void libblis_test_gemmtrsm_ukr_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n, k;
 	inc_t        ldap, ldbp;
 
@@ -203,9 +205,13 @@ void libblis_test_gemmtrsm_ukr_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c
index 4303cd11e..75a93395c 100644
--- a/testsuite/src/test_gemv.c
+++ b/testsuite/src/test_gemv.c
@@ -59,7 +59,7 @@ void libblis_test_gemv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,7 +148,7 @@ void libblis_test_gemv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -162,6 +162,8 @@ void libblis_test_gemv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transa;
@@ -172,6 +174,9 @@ void libblis_test_gemv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c
index 89e215448..590374089 100644
--- a/testsuite/src/test_ger.c
+++ b/testsuite/src/test_ger.c
@@ -59,7 +59,7 @@ void libblis_test_ger_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -146,7 +146,7 @@ void libblis_test_ger_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -160,6 +160,8 @@ void libblis_test_ger_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	conj_t       conjx, conjy;
@@ -168,6 +170,9 @@ void libblis_test_ger_experiment
 	obj_t        a_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c
index 00942bea9..b370dfd58 100644
--- a/testsuite/src/test_hemm.c
+++ b/testsuite/src/test_hemm.c
@@ -59,7 +59,7 @@ void libblis_test_hemm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,7 +154,7 @@ void libblis_test_hemm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -168,6 +168,8 @@ void libblis_test_hemm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 	dim_t        mn_side;
 
@@ -180,6 +182,9 @@ void libblis_test_hemm_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c
index be985bd32..f940813f7 100644
--- a/testsuite/src/test_hemv.c
+++ b/testsuite/src/test_hemv.c
@@ -59,7 +59,7 @@ void libblis_test_hemv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,7 +149,7 @@ void libblis_test_hemv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -163,6 +163,8 @@ void libblis_test_hemv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -173,6 +175,9 @@ void libblis_test_hemv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c
index 49dadb1c1..534106a7b 100644
--- a/testsuite/src/test_her.c
+++ b/testsuite/src/test_her.c
@@ -59,7 +59,7 @@ void libblis_test_her_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -146,7 +146,7 @@ void libblis_test_her_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -160,6 +160,8 @@ void libblis_test_her_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -169,6 +171,9 @@ void libblis_test_her_experiment
 	obj_t        a_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c
index 040df7e68..e3d731ced 100644
--- a/testsuite/src/test_her2.c
+++ b/testsuite/src/test_her2.c
@@ -59,7 +59,7 @@ void libblis_test_her2_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,7 +148,7 @@ void libblis_test_her2_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -162,6 +162,8 @@ void libblis_test_her2_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -171,6 +173,9 @@ void libblis_test_her2_experiment
 	obj_t        a_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c
index bfc1fa900..b61bdf813 100644
--- a/testsuite/src/test_her2k.c
+++ b/testsuite/src/test_her2k.c
@@ -59,7 +59,7 @@ void libblis_test_her2k_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -152,7 +152,7 @@ void libblis_test_her2k_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -166,6 +166,8 @@ void libblis_test_her2k_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, k;
 
 	uplo_t       uploc;
@@ -175,6 +177,9 @@ void libblis_test_her2k_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c
index 3a68eee7a..f5d2c91f5 100644
--- a/testsuite/src/test_herk.c
+++ b/testsuite/src/test_herk.c
@@ -59,7 +59,7 @@ void libblis_test_herk_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,7 +150,7 @@ void libblis_test_herk_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -164,6 +164,8 @@ void libblis_test_herk_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, k;
 
 	uplo_t       uploc;
@@ -173,6 +175,9 @@ void libblis_test_herk_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index d7f5825be..230b65820 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -50,6 +50,13 @@ char libblis_test_store_chars[ NUM_OPERAND_TYPES ][ MAX_STORE_VALS_PER_TYPE + 1
 
 char libblis_test_param_chars[ NUM_PARAM_TYPES ][ MAX_PARAM_VALS_PER_TYPE + 1 ];
 
+char libblis_test_sp_chars[ 2 + 1 ] = "sc";
+char libblis_test_dp_chars[ 2 + 1 ] = "dz";
+
+char libblis_test_rd_chars[ 2 + 1 ] = "sd";
+char libblis_test_cd_chars[ 2 + 1 ] = "cz";
+
+char libblis_test_dt_chars[ 4 + 1 ] = "sdcz";
 
 
 int main( int argc, char** argv )
@@ -239,6 +246,7 @@ void libblis_test_level1m_ops( thread_data_t* tdata, test_params_t* params, test
 	libblis_test_scal2m( tdata, params, &(ops->scal2m) );
 	libblis_test_setm( tdata, params, &(ops->setm) );
 	libblis_test_subm( tdata, params, &(ops->subm) );
+	libblis_test_xpbym( tdata, params, &(ops->xpbym) );
 }
 
 
@@ -348,6 +356,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops )
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->scal2m) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  0, &(ops->setm) );
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->subm) );
+	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN,  1, &(ops->xpbym) );
 
 	// Level-1f
 	libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M,   2, &(ops->axpy2v) );
@@ -482,14 +491,25 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params
 
 	for( i = 0; i < params->n_datatypes; ++i )
 	{
-		if      ( temp[i] == 's' ) params->datatype[i] = BLIS_FLOAT;
-		else if ( temp[i] == 'd' ) params->datatype[i] = BLIS_DOUBLE;
-		else if ( temp[i] == 'c' ) params->datatype[i] = BLIS_SCOMPLEX;
-		else if ( temp[i] == 'z' ) params->datatype[i] = BLIS_DCOMPLEX;
+		//if      ( temp[i] == 's' ) params->datatype[i] = BLIS_FLOAT;
+		//else if ( temp[i] == 'd' ) params->datatype[i] = BLIS_DOUBLE;
+		//else if ( temp[i] == 'c' ) params->datatype[i] = BLIS_SCOMPLEX;
+		//else if ( temp[i] == 'z' ) params->datatype[i] = BLIS_DCOMPLEX;
+
+		// Map the char in temp[i] to the corresponding num_t value.
+		bli_param_map_char_to_blis_dt( temp[i], &(params->datatype[i]) );
 
 		params->datatype_char[i] = temp[i];
 	}
 
+	// Read whether to test gemm with mixed-domain operands.
+	libblis_test_read_next_line( buffer, input_stream );
+	sscanf( buffer, "%u ", &(params->mixed_domain) );
+
+	// Read whether to test gemm with mixed-precision operands.
+	libblis_test_read_next_line( buffer, input_stream );
+	sscanf( buffer, "%u ", &(params->mixed_precision) );
+
 	// Read the initial problem size to test.
 	libblis_test_read_next_line( buffer, input_stream );
 	sscanf( buffer, "%u ", &(params->p_first) );
@@ -1073,6 +1093,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	for( i = 1; i < params->n_datatypes; ++i )
 	libblis_test_fprintf_c( os, "        [%d]                  %d (%c)\n", i, params->datatype[i],
 	                                                                    params->datatype_char[i] );
+	libblis_test_fprintf_c( os, "mix domains for gemm?        %u\n", params->mixed_domain );
+	libblis_test_fprintf_c( os, "mix precisions for gemm?     %u\n", params->mixed_precision );
 	libblis_test_fprintf_c( os, "problem size: first to test  %u\n", params->p_first );
 	libblis_test_fprintf_c( os, "problem size: max to test    %u\n", params->p_max );
 	libblis_test_fprintf_c( os, "problem size increment       %u\n", params->p_inc );
@@ -1091,6 +1113,29 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	libblis_test_fprintf_c( os, "output to stdout AND files?  %u\n", params->output_files );
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf( os, "\n" );
+
+#ifndef BLIS_ENABLE_GEMM_MD
+	// Notify the user if mixed domain or mixed precision was requested.
+	if ( params->mixed_domain || params->mixed_precision )
+	{
+		libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" );
+	}
+#endif
+
+	// If mixed domain or mixed precision was requested, we disable all
+	// induced methods.
+	if ( params->mixed_domain || params->mixed_precision )
+	{
+		ind_t im;
+
+		for ( im = BLIS_IND_FIRST; im < BLIS_IND_LAST+1; ++im )
+		{
+			params->ind_enable[ im ] = 0;
+		}
+
+		// Reenable native execution.
+		params->ind_enable[ BLIS_NAT ] = 1;
+	}
 }
 
 
@@ -1354,23 +1399,26 @@ void carryover( unsigned int* c,
 
 
 
-void libblis_test_op_driver( thread_data_t* tdata,
-                             test_params_t* params,
-                             test_op_t*     op,
-                             iface_t        iface,
-                             char*          op_str,
-                             char*          p_types,
-                             char*          o_types,
-                             thresh_t*      thresh,
-                             void (*f_exp)  (test_params_t*, // params struct
-                                             test_op_t*,     // op struct
-                                             iface_t,        // iface
-                                             num_t,          // datatype (current datatype)
-                                             char*,          // pc_str (current param string)
-                                             char*,          // sc_str (current storage string)
-                                             unsigned int,   // p_cur (current problem size)
-                                             double*,        // perf
-                                             double* ) )     // residual
+void libblis_test_op_driver
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          op_str,
+       char*          p_types,
+       char*          o_types,
+       thresh_t*      thresh,
+       void (*f_exp)  (test_params_t*, // params struct
+                       test_op_t*,     // op struct
+                       iface_t,        // iface
+                       char*,          // dc_str (current datatype string)
+                       char*,          // pc_str (current param string)
+                       char*,          // sc_str (current storage string)
+                       unsigned int,   // p_cur (current problem size)
+                       double*,        // perf
+                       double* )       // residual
+     )
 {
 	unsigned int  n_mstorage          = params->n_mstorage;
 	unsigned int  n_vstorage          = params->n_vstorage;
@@ -1379,6 +1427,8 @@ void libblis_test_op_driver( thread_data_t* tdata,
 	unsigned int  p_max               = params->p_max;
 	unsigned int  p_inc               = params->p_inc;
 	unsigned int  mix_all_storage     = params->mix_all_storage;
+	unsigned int  mixed_domain        = params->mixed_domain;
+	unsigned int  mixed_precision     = params->mixed_precision;
 	unsigned int  reaction_to_failure = params->reaction_to_failure;
 
 	num_t         datatype;
@@ -1392,12 +1442,28 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 	char          s_spec_str[ MAX_NUM_OPERANDS + 1 ];
 	unsigned int  n_operands;
+	unsigned int  n_operandsp1;
 	char**        chars_for_storage;
 	unsigned int  n_store_combos;
 	char**        sc_str;
 
+	char          d_spec_str[ MAX_NUM_OPERANDS + 1 ];
+	char**        chars_for_spdt;
+	char**        chars_for_dpdt;
+	unsigned int  n_spdt_combos;
+	unsigned int  n_dpdt_combos;
+	unsigned int  n_dt_combos;
+	char**        dc_str;
+
+	char**        chars_for_dt;
+	char**        chars_for_rddt;
+	char**        chars_for_cddt;
+	unsigned int  n_rddt_combos;
+	unsigned int  n_cddt_combos;
+
 	unsigned int  p_cur, pi;
-	unsigned int  dt, indi, pci, sci, i, j, o;
+	unsigned int  indi, pci, sci, dci, i, j, o;
+	unsigned int  is_mixed_dt;
 
 	double        perf, resid;
 	char*         pass_str;
@@ -1411,6 +1477,13 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 	FILE*         output_stream = NULL;
 
+	// These arrays are malloc()'ed in select branches. Here, we set
+	// them to NULL so they can be unconditionally free()'ed at the
+	// end of the function.
+	chars_for_rddt = NULL;
+	chars_for_cddt = NULL;
+	chars_for_spdt = NULL;
+	chars_for_dpdt = NULL;
 
 	// If output to files was requested, attempt to open a file stream.
 	if ( params->output_files )
@@ -1447,11 +1520,8 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 	// Compute the total number of parameter combinations to test (which is
 	// simply the product of the string lengths of chars_for_param[i].
-	for ( i = 0, n_param_combos = 1; i < n_params; ++i )
-	{
-		if ( p_spec_str[i] == '?' )
-			n_param_combos *= strlen( chars_for_param[i] );
-	}
+	n_param_combos = libblis_test_count_combos( n_params, p_spec_str,
+	                                            chars_for_param );
 
 	// Allocate an array of parameter combination strings, one for each
 	// parameter combination that needs to be tested.
@@ -1477,12 +1547,11 @@ void libblis_test_op_driver( thread_data_t* tdata,
 	if ( iface == BLIS_TEST_SEQ_UKERNEL )
 		mix_all_storage = DISABLE;
 
-	// Determine the total number of storage schemes.
+	// Enumerate all combinations of storage schemes requested.
 	if ( mix_all_storage )
 	{
 		// Fill storage specification string with wildcard chars.
-		for ( i = 0; i < n_operands; ++i )
-			s_spec_str[i] = '?';
+		for ( i = 0; i < n_operands; ++i ) s_spec_str[i] = '?';
 		s_spec_str[i] = '\0';
 
 		// Allocate an array that stores pointers to the sets of possible
@@ -1499,11 +1568,8 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 		// Compute the total number of storage combinations to test (which is
 		// simply the product of the string lengths of chars_for_storage[i].
-		for ( i = 0, n_store_combos = 1; i < n_operands; ++i )
-		{
-			if ( s_spec_str[i] == '?' )
-				n_store_combos *= strlen( chars_for_storage[i] );
-		}
+		n_store_combos = libblis_test_count_combos( n_operands, s_spec_str,
+		                                            chars_for_storage );
 
 		// Allocate an array of storage combination strings, one for each
 		// storage combination that needs to be tested.
@@ -1581,6 +1647,300 @@ void libblis_test_op_driver( thread_data_t* tdata,
 		}
 	}
 
+	// Enumerate all combinations of datatypes (domains and/or precisions)
+	// requested. Mixed datatypes are only enumerated for the gemm operation.
+
+	if      ( !mixed_domain &&  mixed_precision && op->opid == BLIS_GEMM )
+	{
+		is_mixed_dt = TRUE;
+
+		// Increment the number of operands by one to account for the
+		// computation precision (or computation datatype, as we will encode
+		// it in the char string).
+		n_operandsp1 = n_operands + 1;
+
+		unsigned int has_rd = libblis_test_dt_str_has_rd_char( params );
+		unsigned int has_cd = libblis_test_dt_str_has_cd_char( params );
+
+		// Fill datatype specification string with wildcard chars.
+		for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?';
+		d_spec_str[i] = '\0';
+
+		// Allocate an array that stores pointers to the sets of possible
+		// datatype chars for each operand.
+		chars_for_rddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
+		chars_for_cddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
+
+		// Set the values in chars_for_rddt/cddt to the address of the string
+		// that holds the datatype chars.
+		for ( i = 0; i < n_operandsp1; ++i )
+		{
+			chars_for_rddt[i] = libblis_test_rd_chars;
+			chars_for_cddt[i] = libblis_test_cd_chars;
+		}
+
+		// Set the last set of chars in chars_for_cddt to the real domain
+		// charset. This is because the last char will be the computation
+		// precision.
+		//chars_for_cddt[i-1] = libblis_test_rd_chars;
+
+		// Compute the total number of datatype combinations to test (which is
+		// simply the product of the string lengths of chars_for_rddt/cddt[i]).
+		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
+		// we know they are all '?'.
+		n_rddt_combos = 0; n_cddt_combos = 0;
+
+		if ( has_rd )
+			n_rddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
+			                                           chars_for_rddt );
+
+		if ( has_cd )
+			n_cddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
+			                                           chars_for_cddt );
+
+		// Add real and complex domain combinations.
+		n_dt_combos = n_rddt_combos + n_cddt_combos;
+
+		// Allocate an array of datatype combination strings, one for each
+		// datatype combination that needs to be tested.
+		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
+
+		char** dc_str_p = dc_str;
+
+		// Fill the datatype combination strings in dc_str with the datatype
+		// combinations implied by chars_for_rddt/cddt.
+		if ( has_rd )
+		{
+			libblis_test_fill_param_strings( d_spec_str,
+			                                 chars_for_rddt,
+			                                 n_operandsp1,
+			                                 n_rddt_combos,
+			                                 dc_str_p );
+			dc_str_p += n_rddt_combos;
+		}
+		if ( has_cd )
+		{
+			libblis_test_fill_param_strings( d_spec_str,
+			                                 chars_for_cddt,
+			                                 n_operandsp1,
+			                                 n_cddt_combos,
+			                                 dc_str_p );
+			dc_str_p += n_cddt_combos;
+		}
+
+#if 0
+		printf( "n_rddt_combos = %d\n", n_rddt_combos );
+		printf( "n_cddt_combos = %d\n", n_cddt_combos );
+		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
+
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] );
+
+		bli_abort();
+#endif
+	}
+	else if (  mixed_domain && !mixed_precision && op->opid == BLIS_GEMM )
+	{
+		is_mixed_dt = TRUE;
+
+		// Increment the number of operands by one to account for the
+		// computation precision (or computation datatype, as we will encode
+		// it in the char string).
+		n_operandsp1 = n_operands + 1;
+
+		unsigned int has_sp = libblis_test_dt_str_has_sp_char( params );
+		unsigned int has_dp = libblis_test_dt_str_has_dp_char( params );
+
+		// Fill datatype specification string with wildcard chars.
+		for ( i = 0; i < n_operands; ++i ) d_spec_str[i] = '?';
+		d_spec_str[i] = '\0';
+
+		// Allocate an array that stores pointers to the sets of possible
+		// datatype chars for each operand. (The computation precision char
+		// is set manually below and so needs no charset of its own.)
+		chars_for_spdt = ( char** ) malloc( n_operands * sizeof( char* ) );
+		chars_for_dpdt = ( char** ) malloc( n_operands * sizeof( char* ) );
+
+		// Set the values in chars_for_spdt/dpdt to the address of the string
+		// that holds the datatype chars.
+		for ( i = 0; i < n_operands; ++i )
+		{
+			chars_for_spdt[i] = libblis_test_sp_chars;
+			chars_for_dpdt[i] = libblis_test_dp_chars;
+		}
+
+		// Compute the total number of datatype combinations to test (which is
+		// simply the product of the string lengths of chars_for_spdt/dpdt[i]).
+		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
+		// we know they are all '?'.
+		n_spdt_combos = 0; n_dpdt_combos = 0;
+
+		if ( has_sp )
+			n_spdt_combos = libblis_test_count_combos( n_operands, d_spec_str,
+			                                           chars_for_spdt );
+
+		if ( has_dp )
+			n_dpdt_combos = libblis_test_count_combos( n_operands, d_spec_str,
+			                                           chars_for_dpdt );
+
+		// Add single- and double-precision combinations.
+		n_dt_combos = n_spdt_combos + n_dpdt_combos;
+
+		// Allocate an array of datatype combination strings, one for each
+		// datatype combination that needs to be tested.
+		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
+
+		char** dc_str_p = dc_str;
+
+		// Fill the datatype combination strings in dc_str with the datatype
+		// combinations implied by chars_for_spdt/dpdt.
+		if ( has_sp )
+		{
+			libblis_test_fill_param_strings( d_spec_str,
+			                                 chars_for_spdt,
+			                                 n_operands,
+			                                 n_spdt_combos,
+			                                 dc_str_p );
+			dc_str_p += n_spdt_combos;
+		}
+		if ( has_dp )
+		{
+			libblis_test_fill_param_strings( d_spec_str,
+			                                 chars_for_dpdt,
+			                                 n_operands,
+			                                 n_dpdt_combos,
+			                                 dc_str_p );
+			dc_str_p += n_dpdt_combos;
+		}
+
+		// Manually set the computation char to the real projection of the
+		// first char of each combination.
+		for ( i = 0; i < n_dt_combos; ++i )
+		{
+			dc_str[i][3] = libblis_test_proj_dtchar_to_precchar( dc_str[i][0] );
+			dc_str[i][4] = '\0';
+		}
+
+#if 0
+		printf( "n_spdt_combos = %d\n", n_spdt_combos );
+		printf( "n_dpdt_combos = %d\n", n_dpdt_combos );
+		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
+
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] );
+
+		bli_abort();
+#endif
+	}
+	else if (  mixed_domain &&  mixed_precision && op->opid == BLIS_GEMM )
+	{
+		is_mixed_dt = TRUE;
+
+		// Increment the number of operands by one to account for the
+		// computation precision (or computation datatype, as we will encode
+		// it in the char string).
+		n_operandsp1 = n_operands + 1;
+
+		// Fill datatype specification string with wildcard chars.
+		for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?';
+		d_spec_str[i] = '\0';
+
+		// Allocate an array that stores pointers to the sets of possible
+		// datatype chars for each operand.
+		chars_for_dt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) );
+
+		// Set the values in chars_for_dt to the address of the string that
+		// holds the datatype chars.
+		for ( i = 0; i < n_operandsp1; ++i )
+		{
+			chars_for_dt[i] = libblis_test_dt_chars;
+		}
+
+		// Set the last set of chars in chars_for_dt to the real domain
+		// charset. This is because the last char will be the computation
+		// precision, with the computation domain implied by the operands'
+		// storage datatypes.
+		chars_for_dt[i-1] = libblis_test_rd_chars;
+
+		// Compute the total number of datatype combinations to test (which is
+		// simply the product of the string lengths of chars_for_dt[i]).
+		// NOTE: We skip inspecting/branching off of the d_spec_str chars since
+		// we know they are all '?'.
+		n_dt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str,
+		                                         chars_for_dt );
+
+		// Allocate an array of datatype combination strings, one for each
+		// datatype combination that needs to be tested.
+		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
+
+		// Fill the datatype combination strings in dc_str with the datatype
+		// combinations implied by chars_for_dt.
+		libblis_test_fill_param_strings( d_spec_str,
+		                                 chars_for_dt,
+		                                 n_operandsp1,
+		                                 n_dt_combos,
+		                                 dc_str );
+
+#if 0
+		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
+
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] );
+
+		bli_abort();
+#endif
+	}
+	else // ( ( !mixed_domain && !mixed_precision ) || op->opid != BLIS_GEMM )
+	{
+		is_mixed_dt = FALSE;
+
+		// Increment the number of operands by one to account for the
+		// computation precision (or computation datatype, as we will encode
+		// it in the char string).
+		n_operandsp1 = n_operands + 1;
+
+		// Since we are not mixing datatypes here, we only consider
+		// n_datatypes datatype combinations, where each combination is
+		// homogeneous (e.g. "sss", "ddd", etc., if n_operands == 3).
+		n_dt_combos = n_datatypes;
+
+		// Allocate an array of datatype combination strings, one for each
+		// datatype specified.
+		dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) );
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) );
+
+		// Fill each datatype combination string with the same dt char for
+		// each operand in the current operation.
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+		{
+			dt_char = params->datatype_char[dci];
+
+			for ( i = 0; i < n_operands; ++i )
+				dc_str[dci][i] = dt_char;
+
+			// Encode the computation precision as the last char.
+			dc_str[dci][i] = libblis_test_proj_dtchar_to_precchar( dc_str[dci][0] );
+
+			dc_str[dci][i+1] = '\0';
+		}
+
+#if 0
+		printf( "n_dt_combos   = %d\n\n", n_dt_combos );
+
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+			printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] );
+
+		bli_abort();
+#endif
+	}
+
 
 
 	// These statements should only be executed by one thread.
@@ -1611,10 +1971,27 @@ void libblis_test_op_driver( thread_data_t* tdata,
 	for ( sci = 0; sci < n_store_combos; ++sci )
 	{
 		// Loop over the requested datatypes.
-		for ( dt = 0; dt < n_datatypes; ++dt )
+		for ( dci = 0; dci < n_dt_combos; ++dci )
+		//for ( dci = 14; dci < 15; ++dci )
+		//for ( dci = 12; dci < 13; ++dci )
+		//for ( dci = 4; dci < 5; ++dci )
+		//for ( dci = 8; dci < 9; ++dci )
+		//for ( dci = 0; dci < 1; ++dci )
 		{
-			datatype = params->datatype[dt];
-			dt_char  = params->datatype_char[dt];
+			// We need a datatype to use for induced method related things
+			// as well as to decide which set of residual thresholds to use.
+			// We must choose the first operand's dt char since that's the
+			// only operand we know is guaranteed to exist.
+			bli_param_map_char_to_blis_dt( dc_str[dci][0], &datatype );
+
+			// If any of the operands are single precision, ensure that
+			// datatype is also single precision.
+			int has_sp = libblis_test_dt_str_has_sp_char_str( n_operandsp1,
+			                                                  dc_str[dci] );
+			if ( has_sp )
+			{
+				datatype = bli_dt_proj_to_single_prec( datatype );
+			}
 
 			// Build a commented column label string.
 			libblis_test_build_col_labels_string( params, op, label_str );
@@ -1680,7 +2057,7 @@ void libblis_test_op_driver( thread_data_t* tdata,
 						f_exp( params,
 						       op,
 						       iface,
-						       datatype,
+						       dc_str[dci],
 						       pc_str[pci],
 						       sc_str[sci],
 						       p_cur,
@@ -1692,18 +2069,22 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 						// Query the string corresponding to the residual's
 						// position relative to the thresholds.
+						// NOTE: Passing in datatype (ie: the value associated
+						// with dc_str[dci][0]) will work, but just barely, since
+						// the numerical thresholds within precisions should be
+						// the same.
 						pass_str = libblis_test_get_string_for_result( resid,
 						                                               datatype,
 						                                               thresh );
 
-						// Build a string unique to the operation, datatype,
-						// parameter combination, and storage combination being
-						// tested.
+						// Build a string unique to the operation, datatype combo,
+						// parameter combo, and storage combo being tested.
 						libblis_test_build_function_string( BLIS_FILEDATA_PREFIX_STR,
 						                                    indi,
 						                                    ind_str,
 						                                    op_str,
-						                                    dt_char,
+						                                    is_mixed_dt,
+						                                    dc_str[dci],
 						                                    n_param_combos,
 						                                    pc_str[pci],
 						                                    sc_str[sci],
@@ -1812,6 +2193,18 @@ void libblis_test_op_driver( thread_data_t* tdata,
 		free( sc_str[sci] );
 	free( sc_str );
 
+	// Free some auxiliary arrays used by the mixed-domain/mixed-precision
+	// datatype-handling logic.
+	free( chars_for_rddt );
+	free( chars_for_cddt );
+	free( chars_for_spdt );
+	free( chars_for_dpdt );
+
+	// Free the datatype combination strings and then the master pointer.
+	for ( dci = 0; dci < n_dt_combos; ++dci )
+		free( dc_str[dci] );
+	free( dc_str );
+
 
 	// If the file was opened (successfully), close the output stream.
 	if ( output_stream )
@@ -1824,17 +2217,27 @@ void libblis_test_op_driver( thread_data_t* tdata,
 
 
 
-void libblis_test_build_function_string( char*        prefix_str,
-                                         ind_t        method,
-                                         char*        ind_str,
-                                         char*        op_str,
-                                         char         dt_char,
-                                         unsigned int n_param_combos,
-                                         char*        pc_str,
-                                         char*        sc_str,
-                                         char*        funcname_str )
+void libblis_test_build_function_string
+     (
+       char*        prefix_str,
+       ind_t        method,
+       char*        ind_str,
+       char*        op_str,
+       unsigned int is_mixed_dt,
+       char*        dc_str,
+       unsigned int n_param_combos,
+       char*        pc_str,
+       char*        sc_str,
+       char*        funcname_str
+     )
 {
-	sprintf( funcname_str, "%s_%c%s", prefix_str, dt_char, op_str );
+	// We only print the full datatype combination string if is_mixed_dt
+	// is set and native execution is being used. Otherwise, we print only
+	// the first char (since they are all the same).
+	if ( is_mixed_dt == TRUE && method == BLIS_NAT )
+		sprintf( funcname_str, "%s_%s%s", prefix_str, dc_str, op_str );
+	else
+		sprintf( funcname_str, "%s_%c%s", prefix_str, dc_str[0], op_str );
 
 	// If the method is non-native (ie: induced), append a string
 	// identifying the induced method.
@@ -2662,3 +3065,99 @@ int libblis_test_l3_is_disabled( test_op_t* op )
 	if ( op->ops->l3_over == DISABLE ) return TRUE;
 	else                               return FALSE;
 }
+
+// ---
+// Returns TRUE if any datatype char requested in params is 's' or 'c'.
+int libblis_test_dt_str_has_sp_char( test_params_t* params )
+{
+	return libblis_test_dt_str_has_sp_char_str( params->n_datatypes,
+	                                            params->datatype_char );
+}
+
+int libblis_test_dt_str_has_sp_char_str( int n, char* str )
+{
+	for ( int i = 0; i < n; ++i )
+	{
+		if ( str[i] == 's' ||
+		     str[i] == 'c' ) return TRUE;
+	}
+	// None of the first n chars of str was 's' or 'c'.
+	return FALSE;
+}
+
+// ---
+// Returns TRUE if any datatype char requested in params is 'd' or 'z'.
+int libblis_test_dt_str_has_dp_char( test_params_t* params )
+{
+	return libblis_test_dt_str_has_dp_char_str( params->n_datatypes,
+	                                            params->datatype_char );
+}
+
+int libblis_test_dt_str_has_dp_char_str( int n, char* str )
+{
+	for ( int i = 0; i < n; ++i )
+	{
+		if ( str[i] == 'd' ||
+		     str[i] == 'z' ) return TRUE;
+	}
+	// None of the first n chars of str was 'd' or 'z'.
+	return FALSE;
+}
+
+// ---
+
+int libblis_test_dt_str_has_rd_char( test_params_t* params )
+{
+	int i;
+	// Return TRUE as soon as one of the requested datatype chars is 's' or 'd'.
+	for ( i = 0; i < params->n_datatypes; ++i )
+	{
+		if ( params->datatype_char[i] == 's' ||
+		     params->datatype_char[i] == 'd' ) return TRUE;
+	}
+	// None of the params->n_datatypes requested chars was 's' or 'd'.
+	return FALSE;
+}
+
+int libblis_test_dt_str_has_cd_char( test_params_t* params )
+{
+	int i;
+	// Return TRUE as soon as one of the requested datatype chars is 'c' or 'z'.
+	for ( i = 0; i < params->n_datatypes; ++i )
+	{
+		if ( params->datatype_char[i] == 'c' ||
+		     params->datatype_char[i] == 'z' ) return TRUE;
+	}
+	// None of the params->n_datatypes requested chars was 'c' or 'z'.
+	return FALSE;
+}
+
+unsigned int libblis_test_count_combos
+     (
+       unsigned int n_operands,
+       char*        spec_str,
+       char**       char_sets
+     )
+{
+	unsigned int n_combos = 1;
+	int i;
+	// Multiply in the size of char_sets[i] for each wildcard ('?') position.
+	for ( i = 0; i < n_operands; ++i )
+	{
+		if ( spec_str[i] == '?' )
+			n_combos *= strlen( char_sets[i] );
+	}
+	// If spec_str held no wildcards, the result is still 1.
+	return n_combos;
+}
+
+char libblis_test_proj_dtchar_to_precchar( char dt_char )
+{
+	char r_val = dt_char;
+	// Map 'c' to 's' and 'z' to 'd'; any other char is returned unchanged.
+	if      ( r_val == 'c' ) r_val = 's';
+	else if ( r_val == 'z' ) r_val = 'd';
+
+	return r_val;
+}
+
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h
index 5b2f2c2e5..020f23549 100644
--- a/testsuite/src/test_libblis.h
+++ b/testsuite/src/test_libblis.h
@@ -176,6 +176,8 @@ typedef struct
 	unsigned int  n_datatypes;
 	char          datatype_char[ MAX_NUM_DATATYPES + 1 ];
 	num_t         datatype[ MAX_NUM_DATATYPES + 1 ];
+    unsigned int  mixed_domain;
+    unsigned int  mixed_precision;
 	unsigned int  p_first;
 	unsigned int  p_max;
 	unsigned int  p_inc;
@@ -251,6 +253,7 @@ typedef struct test_ops_s
 	test_op_t scal2m;
 	test_op_t setm;
 	test_op_t subm;
+	test_op_t xpbym;
 
 	// level-1f
 	test_op_t axpy2v;
@@ -369,35 +372,42 @@ void carryover( unsigned int* c,
 
 // --- Operation driver ---
 
-void libblis_test_op_driver( thread_data_t* tdata,
-                             test_params_t* params,
-                             test_op_t*     op,
-                             iface_t        iface,
-                             char*          op_str,
-                             char*          p_types,
-                             char*          o_types,
-                             thresh_t*      thresh,
-                             void (*f_exp)  (test_params_t*, // params struct
-                                             test_op_t*,     // op struct
-                                             iface_t,        // iface
-                                             num_t,          // datatype (current datatype)
-                                             char*,          // pc_str (current param string)
-                                             char*,          // sc_str (current storage string)
-                                             unsigned int,   // p_cur (current problem size)
-                                             double*,        // perf
-                                             double* ) );    // residual
+void libblis_test_op_driver
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          op_str,
+       char*          p_types,
+       char*          o_types,
+       thresh_t*      thresh,
+       void (*f_exp)  (test_params_t*, // params struct
+                       test_op_t*,     // op struct
+                       iface_t,        // iface
+                       char*,          // dc_str (current datatype string)
+                       char*,          // pc_str (current param string)
+                       char*,          // sc_str (current storage string)
+                       unsigned int,   // p_cur (current problem size)
+                       double*,        // perf
+                       double*)        // residual
+     );
 
 // --- Generate experiment string labels ---
 
-void libblis_test_build_function_string( char*        prefix_str,
-                                         ind_t        method,
-                                         char*        ind_str,
-                                         char*        op_str,
-                                         char         dt_char,
-                                         unsigned int n_param_combos,
-                                         char*        pc_str,
-                                         char*        sc_str,
-                                         char*        func_str );
+void libblis_test_build_function_string
+     (
+       char*        prefix_str,
+       ind_t        method,
+       char*        ind_str,
+       char*        op_str,
+       unsigned int is_mixed_dt,
+       char*        dc_str,
+       unsigned int n_param_combos,
+       char*        pc_str,
+       char*        sc_str,
+       char*        funcname_str
+     );
 
 void libblis_test_build_dims_string( test_op_t* op,
                                      dim_t      p_cur,
@@ -465,6 +475,21 @@ int libblis_test_l1f_is_disabled( test_op_t* op );
 int libblis_test_l2_is_disabled( test_op_t* op );
 int libblis_test_l3ukr_is_disabled( test_op_t* op );
 int libblis_test_l3_is_disabled( test_op_t* op );
+int libblis_test_dt_str_has_sp_char( test_params_t* params );
+int libblis_test_dt_str_has_sp_char_str( int n, char* str );
+int libblis_test_dt_str_has_dp_char( test_params_t* params );
+int libblis_test_dt_str_has_dp_char_str( int n, char* str );
+int libblis_test_dt_str_has_rd_char( test_params_t* params );
+int libblis_test_dt_str_has_cd_char( test_params_t* params );
+
+unsigned int libblis_test_count_combos
+     (
+       unsigned int n_operands,
+       char*        spec_str,
+       char**       char_sets
+     );
+char libblis_test_proj_dtchar_to_precchar( char dt_char );
+
 
 //
 // --- Test module headers -----------------------------------------------------
@@ -498,6 +523,7 @@ int libblis_test_l3_is_disabled( test_op_t* op );
 #include "test_scal2m.h"
 #include "test_setm.h"
 #include "test_subm.h"
+#include "test_xpbym.h"
 
 // Level-1f kernels
 #include "test_axpy2v.h"
diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c
index 41ecccb7f..7ae052f21 100644
--- a/testsuite/src/test_normfm.c
+++ b/testsuite/src/test_normfm.c
@@ -59,7 +59,7 @@ void libblis_test_normfm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -137,7 +137,7 @@ void libblis_test_normfm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,17 +148,24 @@ void libblis_test_normfm_experiment
 	unsigned int n_repeats = params->n_repeats;
 	unsigned int i;
 
-	num_t        dt_real   = bli_dt_proj_to_real( datatype );
-
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+	num_t        dt_real;
+
 	dim_t        m, n;
 
 	obj_t        beta, norm;
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
+	// Compute the real projection of the chosen datatype.
+	dt_real = bli_dt_proj_to_real( datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c
index 791fa9fc5..ac83481ed 100644
--- a/testsuite/src/test_normfv.c
+++ b/testsuite/src/test_normfv.c
@@ -59,7 +59,7 @@ void libblis_test_normfv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -137,7 +137,7 @@ void libblis_test_normfv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,17 +148,24 @@ void libblis_test_normfv_experiment
 	unsigned int n_repeats = params->n_repeats;
 	unsigned int i;
 
-	num_t        dt_real   = bli_dt_proj_to_real( datatype );
-
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+	num_t        dt_real;
+
 	dim_t        m;
 
 	obj_t        beta, norm;
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
+	// Compute the real projection of the chosen datatype.
+	dt_real = bli_dt_proj_to_real( datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c
index 830440c45..37b437d42 100644
--- a/testsuite/src/test_randm.c
+++ b/testsuite/src/test_randm.c
@@ -59,7 +59,7 @@ void libblis_test_randm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -134,7 +134,7 @@ void libblis_test_randm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          dt,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,6 +148,8 @@ void libblis_test_randm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	char         x_store;
@@ -155,6 +157,9 @@ void libblis_test_randm_experiment
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
@@ -165,7 +170,7 @@ void libblis_test_randm_experiment
 	x_store = sc_str[0];
 
 	// Create the test objects.
-	libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE, x_store, m, n, &x );
+	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, x_store, m, n, &x );
 
 	// Repeat the experiment n_repeats times and record results. 
 	for ( i = 0; i < n_repeats; ++i )
diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c
index 0cccb0b5e..e1bf28fb9 100644
--- a/testsuite/src/test_randv.c
+++ b/testsuite/src/test_randv.c
@@ -59,7 +59,7 @@ void libblis_test_randv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -134,7 +134,7 @@ void libblis_test_randv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,6 +148,8 @@ void libblis_test_randv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	char         x_store;
@@ -155,6 +157,9 @@ void libblis_test_randv_experiment
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c
index 13af3791c..9814326af 100644
--- a/testsuite/src/test_scal2m.c
+++ b/testsuite/src/test_scal2m.c
@@ -59,7 +59,7 @@ void libblis_test_scal2m_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -143,7 +143,7 @@ void libblis_test_scal2m_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -157,6 +157,8 @@ void libblis_test_scal2m_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transx;
@@ -165,6 +167,9 @@ void libblis_test_scal2m_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c
index d2909f937..765bd92ec 100644
--- a/testsuite/src/test_scal2v.c
+++ b/testsuite/src/test_scal2v.c
@@ -59,7 +59,7 @@ void libblis_test_scal2v_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -143,7 +143,7 @@ void libblis_test_scal2v_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -157,6 +157,8 @@ void libblis_test_scal2v_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -165,6 +167,9 @@ void libblis_test_scal2v_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c
index 93130a6d8..adeefacc1 100644
--- a/testsuite/src/test_scalm.c
+++ b/testsuite/src/test_scalm.c
@@ -59,7 +59,7 @@ void libblis_test_scalm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -139,7 +139,7 @@ void libblis_test_scalm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -153,6 +153,8 @@ void libblis_test_scalm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	conj_t       conjbeta;
@@ -161,6 +163,9 @@ void libblis_test_scalm_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c
index d3bd9450e..e276d2a6b 100644
--- a/testsuite/src/test_scalv.c
+++ b/testsuite/src/test_scalv.c
@@ -59,7 +59,7 @@ void libblis_test_scalv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -140,7 +140,7 @@ void libblis_test_scalv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,6 +154,8 @@ void libblis_test_scalv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjbeta;
@@ -162,6 +164,9 @@ void libblis_test_scalv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c
index e2d6336dd..dbedcdea4 100644
--- a/testsuite/src/test_setm.c
+++ b/testsuite/src/test_setm.c
@@ -59,7 +59,7 @@ void libblis_test_setm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -136,7 +136,7 @@ void libblis_test_setm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,12 +150,17 @@ void libblis_test_setm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	obj_t        beta;
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c
index d4f8143f4..f1984f6be 100644
--- a/testsuite/src/test_setv.c
+++ b/testsuite/src/test_setv.c
@@ -59,7 +59,7 @@ void libblis_test_setv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -136,7 +136,7 @@ void libblis_test_setv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,12 +150,17 @@ void libblis_test_setv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	obj_t        beta;
 	obj_t        x;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c
index e6d84a5b3..287f299b1 100644
--- a/testsuite/src/test_subm.c
+++ b/testsuite/src/test_subm.c
@@ -59,7 +59,7 @@ void libblis_test_subm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -139,7 +139,7 @@ void libblis_test_subm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,6 +150,8 @@ void libblis_test_subm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	trans_t      transx;
@@ -158,6 +160,9 @@ void libblis_test_subm_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c
index 636a16e9c..300b054a1 100644
--- a/testsuite/src/test_subv.c
+++ b/testsuite/src/test_subv.c
@@ -59,7 +59,7 @@ void libblis_test_subv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -139,7 +139,7 @@ void libblis_test_subv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,6 +150,8 @@ void libblis_test_subv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -158,6 +160,9 @@ void libblis_test_subv_experiment
 	obj_t        x, y;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c
index 65bda9634..3f2615b2f 100644
--- a/testsuite/src/test_symm.c
+++ b/testsuite/src/test_symm.c
@@ -59,7 +59,7 @@ void libblis_test_symm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,7 +154,7 @@ void libblis_test_symm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -168,6 +168,8 @@ void libblis_test_symm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 	dim_t        mn_side;
 
@@ -180,6 +182,9 @@ void libblis_test_symm_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c
index 82e932d81..7e06388d7 100644
--- a/testsuite/src/test_symv.c
+++ b/testsuite/src/test_symv.c
@@ -59,7 +59,7 @@ void libblis_test_symv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -149,7 +149,7 @@ void libblis_test_symv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -163,6 +163,8 @@ void libblis_test_symv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -173,6 +175,9 @@ void libblis_test_symv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c
index c4a8f45a8..12d1e60f6 100644
--- a/testsuite/src/test_syr.c
+++ b/testsuite/src/test_syr.c
@@ -59,7 +59,7 @@ void libblis_test_syr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -146,7 +146,7 @@ void libblis_test_syr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -160,6 +160,8 @@ void libblis_test_syr_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -169,6 +171,9 @@ void libblis_test_syr_experiment
 	obj_t        a_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c
index 9b2d59098..e28a4fdd0 100644
--- a/testsuite/src/test_syr2.c
+++ b/testsuite/src/test_syr2.c
@@ -59,7 +59,7 @@ void libblis_test_syr2_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -148,7 +148,7 @@ void libblis_test_syr2_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -162,6 +162,8 @@ void libblis_test_syr2_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -171,6 +173,9 @@ void libblis_test_syr2_experiment
 	obj_t        a_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c
index 39405fda5..2aa9754ba 100644
--- a/testsuite/src/test_syr2k.c
+++ b/testsuite/src/test_syr2k.c
@@ -59,7 +59,7 @@ void libblis_test_syr2k_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -152,7 +152,7 @@ void libblis_test_syr2k_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -166,6 +166,8 @@ void libblis_test_syr2k_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, k;
 
 	uplo_t       uploc;
@@ -175,6 +177,9 @@ void libblis_test_syr2k_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c
index 621bd7c81..4e450ad03 100644
--- a/testsuite/src/test_syrk.c
+++ b/testsuite/src/test_syrk.c
@@ -59,7 +59,7 @@ void libblis_test_syrk_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,7 +150,7 @@ void libblis_test_syrk_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -164,6 +164,8 @@ void libblis_test_syrk_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, k;
 
 	uplo_t       uploc;
@@ -173,6 +175,9 @@ void libblis_test_syrk_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c
index a3c245aef..5ee739645 100644
--- a/testsuite/src/test_trmm.c
+++ b/testsuite/src/test_trmm.c
@@ -59,7 +59,7 @@ void libblis_test_trmm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,7 +150,7 @@ void libblis_test_trmm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -164,6 +164,8 @@ void libblis_test_trmm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 	dim_t        mn_side;
 
@@ -176,6 +178,9 @@ void libblis_test_trmm_experiment
 	obj_t        b_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c
index caf8269b5..494c7ef84 100644
--- a/testsuite/src/test_trmm3.c
+++ b/testsuite/src/test_trmm3.c
@@ -59,7 +59,7 @@ void libblis_test_trmm3_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,7 +154,7 @@ void libblis_test_trmm3_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -168,6 +168,8 @@ void libblis_test_trmm3_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 	dim_t        mn_side;
 
@@ -181,6 +183,9 @@ void libblis_test_trmm3_experiment
 	obj_t        c_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c
index 7b9143f92..bd39d30e1 100644
--- a/testsuite/src/test_trmv.c
+++ b/testsuite/src/test_trmv.c
@@ -59,7 +59,7 @@ void libblis_test_trmv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -145,7 +145,7 @@ void libblis_test_trmv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -159,6 +159,8 @@ void libblis_test_trmv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -169,6 +171,9 @@ void libblis_test_trmv_experiment
 	obj_t        x_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c
index e571ac0bb..23cb1e5b5 100644
--- a/testsuite/src/test_trsm.c
+++ b/testsuite/src/test_trsm.c
@@ -59,7 +59,7 @@ void libblis_test_trsm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -150,7 +150,7 @@ void libblis_test_trsm_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -164,6 +164,8 @@ void libblis_test_trsm_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 	dim_t        mn_side;
 
@@ -176,6 +178,9 @@ void libblis_test_trsm_experiment
 	obj_t        b_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to actual dimensions.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index eb3c06520..555cf9fbb 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -59,7 +59,7 @@ void libblis_test_trsm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -154,7 +154,7 @@ void libblis_test_trsm_ukr_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -168,6 +168,8 @@ void libblis_test_trsm_ukr_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m, n;
 
 	char         sc_a = 'c';
@@ -182,9 +184,13 @@ void libblis_test_trsm_ukr_experiment
 
 	cntx_t*      cntx;
 
+
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Fix m and n to MR and NR, respectively.
 	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
 	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c
index bedf5039a..6bc3b220f 100644
--- a/testsuite/src/test_trsv.c
+++ b/testsuite/src/test_trsv.c
@@ -59,7 +59,7 @@ void libblis_test_trsv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -145,7 +145,7 @@ void libblis_test_trsv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -159,6 +159,8 @@ void libblis_test_trsv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	uplo_t       uploa;
@@ -169,6 +171,9 @@ void libblis_test_trsv_experiment
 	obj_t        x_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
diff --git a/testsuite/src/test_xpbym.c b/testsuite/src/test_xpbym.c
new file mode 100644
index 000000000..1192fdb10
--- /dev/null
+++ b/testsuite/src/test_xpbym.c
@@ -0,0 +1,314 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2018, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "test_libblis.h"
+
+
+// Static variables: operation name, operand/parameter type strings, and per-datatype thresholds passed to the test driver.
+static char*     op_str                    = "xpbym";
+static char*     o_types                   = "mm";  // operands: x y
+static char*     p_types                   = "h";   // parameters: transx
+static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
+                                               { 1e-04, 1e-05 },   // warn, pass for c
+                                               { 1e-13, 1e-14 },   // warn, pass for d
+                                               { 1e-13, 1e-14 } }; // warn, pass for z
+
+// Prototypes for the helper functions defined later in this file.
+void libblis_test_xpbym_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
+void libblis_test_xpbym_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     );
+
+void libblis_test_xpbym_impl
+     (
+       iface_t   iface,
+       obj_t*    x,
+       obj_t*    beta,
+       obj_t*    y
+     );
+
+void libblis_test_xpbym_check
+     (
+       test_params_t* params,
+       obj_t*         x,
+       obj_t*         beta,
+       obj_t*         y,
+       obj_t*         y_save,
+       double*        resid
+     );
+
+
+// Run the tests for the operations that the xpbym test depends on.
+void libblis_test_xpbym_deps
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+	libblis_test_randm( tdata, params, &(op->ops->randm) );
+	libblis_test_normfm( tdata, params, &(op->ops->normfm) );
+	libblis_test_addm( tdata, params, &(op->ops->addm) );
+	libblis_test_subm( tdata, params, &(op->ops->subm) );
+	libblis_test_copym( tdata, params, &(op->ops->copym) );
+	libblis_test_scalm( tdata, params, &(op->ops->scalm) );
+}
+
+
+
+void libblis_test_xpbym
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     )
+{
+	// Skip the test if it has already been run, or if either of the
+	// "disabled" checks reports that it should not run.
+	if ( libblis_test_op_is_done( op )     ||
+	     libblis_test_op_is_disabled( op ) ||
+	     libblis_test_l1m_is_disabled( op ) )
+	{
+		return;
+	}
+
+	// Test the operations that this test relies upon before anything else.
+	if ( TRUE )
+	{
+		libblis_test_xpbym_deps( tdata, params, op );
+	}
+
+	// Hand the experiment callback to the test driver. Only the
+	// BLIS_TEST_SEQ_FRONT_END interface is driven here.
+	//if ( op->front_seq == ENABLE )
+	libblis_test_op_driver( tdata, params, op,
+	                        BLIS_TEST_SEQ_FRONT_END,
+	                        op_str,
+	                        p_types, o_types,
+	                        thresh,
+	                        libblis_test_xpbym_experiment );
+}
+
+
+// Run one xpbym experiment: build the operands, time the operation, and report performance and residual.
+void libblis_test_xpbym_experiment
+     (
+       test_params_t* params,
+       test_op_t*     op,
+       iface_t        iface,
+       char*          dc_str,
+       char*          pc_str,
+       char*          sc_str,
+       unsigned int   p_cur,
+       double*        perf,
+       double*        resid
+     )
+{
+	unsigned int n_repeats = params->n_repeats;
+	unsigned int i;
+
+	double       time_min  = DBL_MAX;
+	double       time;
+
+	num_t        datatype;
+
+	dim_t        m, n;
+
+	trans_t      transx;
+
+	obj_t        x, beta, y;
+	obj_t        y_save;
+
+
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
+	// Map the dimension specifier to actual dimensions.
+	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
+	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
+
+	// Map parameter characters to BLIS constants.
+	bli_param_map_char_to_blis_trans( pc_str[0], &transx );
+
+	// Create test scalars.
+	bli_obj_scalar_init_detached( datatype, &beta );
+
+	// Create the test matrices. NOTE(review): x, y, and y_save all use sc_str[0] although o_types names two operands -- confirm this is intended.
+	libblis_test_mobj_create( params, datatype, transx,
+	                          sc_str[0], m, n, &x );
+	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	                          sc_str[0], m, n, &y );
+	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
+	                          sc_str[0], m, n, &y_save );
+
+	// Set beta to ( -2.0, 0.0 ) if y is real and to ( 0.0, -2.0 ) otherwise.
+	if ( bli_obj_is_real( &y ) )
+		bli_setsc( -2.0,  0.0, &beta );
+	else
+		bli_setsc(  0.0, -2.0, &beta );
+
+	// Randomize x and y, and save a copy of y in y_save.
+	libblis_test_mobj_randomize( params, FALSE, &x );
+	libblis_test_mobj_randomize( params, FALSE, &y );
+	bli_copym( &y, &y_save );
+
+	// Apply the conjugation/transposition parameter to x.
+	bli_obj_set_conjtrans( transx, &x );
+
+	// Repeat the experiment n_repeats times (restoring y from y_save each time) and record results.
+	for ( i = 0; i < n_repeats; ++i )
+	{
+		bli_copym( &y_save, &y );
+
+		time = bli_clock();
+
+		libblis_test_xpbym_impl( iface, &x, &beta, &y );
+
+		time_min = bli_clock_min_diff( time_min, time );
+	}
+
+	// Estimate the performance of the best experiment repeat.
+	*perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+	if ( bli_obj_is_complex( &y ) ) *perf *= 4.0;
+
+	// Perform checks.
+	libblis_test_xpbym_check( params, &x, &beta, &y, &y_save, resid );
+
+	// Zero out performance and residual if output matrix is empty.
+	libblis_test_check_empty_problem( &y, perf, resid );
+
+	// Free the test objects.
+	bli_obj_free( &x );
+	bli_obj_free( &y );
+	bli_obj_free( &y_save );
+}
+
+
+
+void libblis_test_xpbym_impl
+     (
+       iface_t   iface,
+       obj_t*    x,
+       obj_t*    beta,
+       obj_t*    y
+     )
+{
+	// Call bli_xpbym() for the front-end interface; report anything else.
+	if ( iface == BLIS_TEST_SEQ_FRONT_END )
+	{
+		bli_xpbym( x, beta, y );
+	}
+	else
+	{
+		libblis_test_printf_error( "Invalid interface type.\n" );
+	}
+}
+
+
+// Store normf( y - ( beta * y_orig + transx(x) ) ) in *resid; y is overwritten in the process.
+void libblis_test_xpbym_check
+     (
+       test_params_t* params,
+       obj_t*         x,
+       obj_t*         beta,
+       obj_t*         y,
+       obj_t*         y_orig,
+       double*        resid
+     )
+{
+	num_t  dt      = bli_obj_dt( y );
+	num_t  dt_real = bli_obj_dt_proj_to_real( y );
+
+	dim_t  m       = bli_obj_length( y );
+	dim_t  n       = bli_obj_width( y );
+
+	obj_t  x_temp, y_temp;
+	obj_t  norm;
+
+	double junk;
+
+	//
+	// Pre-conditions:
+	// - x is randomized.
+	// - y_orig is randomized.
+	// Note:
+	// - beta should have a non-zero imaginary component in the complex
+	//   cases in order to more fully exercise the implementation.
+	//
+	// Under these conditions, we assume that the implementation for
+	//
+	//   y := beta * y_orig + transx(x)
+	//
+	// is functioning correctly if
+	//
+	//   normf( y - ( beta * y_orig + transx(x) ) )
+	//
+	// is negligible.
+	//
+
+	bli_obj_scalar_init_detached( dt_real, &norm );
+
+    bli_obj_create( dt, m, n, 0, 0, &x_temp );
+    bli_obj_create( dt, m, n, 0, 0, &y_temp );
+
+    bli_copym( x,      &x_temp );
+    bli_copym( y_orig, &y_temp );
+
+    bli_scalm( beta, &y_temp );
+	bli_addm( &x_temp, &y_temp );
+	// y_temp now holds the reference result; subtract it from y and take the norm of the difference.
+    bli_subm( &y_temp, y );
+    bli_normfm( y, &norm );
+    bli_getsc( &norm, resid, &junk );
+
+    bli_obj_free( &x_temp );
+    bli_obj_free( &y_temp );
+}
+
diff --git a/testsuite/src/test_xpbym.h b/testsuite/src/test_xpbym.h
new file mode 100644
index 000000000..c272b1b90
--- /dev/null
+++ b/testsuite/src/test_xpbym.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2018, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Prototype for the xpbym test entry point (definition presumably in test_xpbym.c -- confirm).
+void libblis_test_xpbym
+     (
+       thread_data_t* tdata,
+       test_params_t* params,
+       test_op_t*     op
+     );
+
diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c
index 694d19f30..8fc9b7201 100644
--- a/testsuite/src/test_xpbyv.c
+++ b/testsuite/src/test_xpbyv.c
@@ -59,7 +59,7 @@ void libblis_test_xpbyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -143,7 +143,7 @@ void libblis_test_xpbyv_experiment
        test_params_t* params,
        test_op_t*     op,
        iface_t        iface,
-       num_t          datatype,
+       char*          dc_str,
        char*          pc_str,
        char*          sc_str,
        unsigned int   p_cur,
@@ -157,6 +157,8 @@ void libblis_test_xpbyv_experiment
 	double       time_min  = DBL_MAX;
 	double       time;
 
+	num_t        datatype;
+
 	dim_t        m;
 
 	conj_t       conjx;
@@ -165,6 +167,9 @@ void libblis_test_xpbyv_experiment
 	obj_t        y_save;
 
 
+	// Use the datatype of the first char in the datatype combination string.
+	bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
+
 	// Map the dimension specifier to an actual dimension.
 	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );