diff --git a/CREDITS b/CREDITS index 17e9e14f2..f1fa0b71f 100644 --- a/CREDITS +++ b/CREDITS @@ -35,13 +35,14 @@ but many others have contributed code and feedback, including Tony Kelman @tkelman Lee Killough (Cray) Mike Kistler (IBM, Austin Research Laboratory) + Michael Lehn @michael-lehn + Dave Love @loveshack + Tze Meng Low (The University of Texas at Austin) + Ye Luo @ye-luo (Argonne National Laboratory) Ricardo Magana @magania (Hewlett Packard Enterprise) Bryan Marker @bamarker (The University of Texas at Austin) Devin Matthews @devinamatthews (The University of Texas at Austin) Stefanos Mavros @smavros - Michael Lehn @michael-lehn - Dave Love @loveshack - Tze Meng Low (The University of Texas at Austin) Nisanth Padinharepatt (AMD) Devangi Parikh @dnparikh (The University of Texas at Austin) Elmar Peise @elmar-peise (RWTH-Aachen) diff --git a/build/add-copyright.py b/build/add-copyright.py index 0d5e52d5e..9a18b95fc 100755 --- a/build/add-copyright.py +++ b/build/add-copyright.py @@ -187,6 +187,8 @@ def main(): else: filename = git_words[1] + #my_echo( "-debug---- %s" % filename ) + # Start by opening the file. (We can assume it exists since it # was found by 'git status', so no need to check for existence.) # Read all lines in the file and then close it. @@ -203,7 +205,7 @@ def main(): # If the file does not have any copyright notice in it already, we # assume we don't need to update it. if not has_cr: - my_echo( "[skipped] %s" % filename ) + my_echo( "[nocrline] %s" % filename ) continue # Check whether the file already has a copyright for the_org. We may @@ -214,7 +216,7 @@ def main(): mod_file_lines = [] # At this point we know that the file has at least one copyright, and - # has_org_cr encodes whether already has a copyright for the_org. + # has_org_cr encodes whether it already has a copyright for the_org. # We process the files that we know already have copyrights for the_org # differently from the files that do not yet have them. 
@@ -240,12 +242,15 @@ def main(): repl_line = ' %s, ' % cur_year line_ny = re.sub( find_line, repl_line, line ) - my_echo( "[updated] %s" % filename ) + my_echo( "[updated ] %s" % filename ) # Add the updated line to the running list. mod_file_lines += line_ny else: + + my_echo( "[up2date ] %s" % filename ) + # Add the unchanged line to the running list. mod_file_lines += line @@ -262,7 +267,7 @@ def main(): # Don't go any further if we're only updating existing copyright # lines. if update_only: - my_echo( "[skipped] %s" % filename ) + my_echo( "[nocrline] %s" % filename ) continue num_file_lines = len( file_lines ) @@ -313,7 +318,7 @@ def main(): mod_file_lines += line mod_file_lines += line_nyno - my_echo( "[added ] %s" % filename ) + my_echo( "[added ] %s" % filename ) # endif resnext diff --git a/build/bli_config.h.in b/build/bli_config.h.in index b7e5adf85..2fa1fb127 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,6 +53,14 @@ #define BLIS_ENABLE_PTHREADS #endif +#if @enable_jrir_slab@ +#define BLIS_ENABLE_JRIR_SLAB +#endif + +#if @enable_jrir_rr@ +#define BLIS_ENABLE_JRIR_RR +#endif + #if @enable_packbuf_pools@ #define BLIS_ENABLE_PACKBUF_POOLS #endif diff --git a/build/irun.py b/build/irun.py new file mode 100755 index 000000000..97cc39c2f --- /dev/null +++ b/build/irun.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Import modules +import os +import sys +import getopt +import re +import subprocess +import time +import statistics + + +def print_usage(): + + my_print( " " ) + my_print( " %s" % script_name ) + my_print( " " ) + my_print( " Field G. Van Zee" ) + my_print( " " ) + my_print( " Repeatedly run a test driver and accumulate statistics for the" ) + my_print( " output." 
) + my_print( " " ) + my_print( " Usage:" ) + my_print( " " ) + my_print( " %s [options] drivername" % script_name ) + my_print( " " ) + my_print( " Arguments:" ) + my_print( " " ) + my_print( " drivername The filename/path of the test driver to run. The" ) + my_print( " test driver must output its performance data to" ) + my_print( " standard output." ) + my_print( " " ) + my_print( " The following options are accepted:" ) + my_print( " " ) + my_print( " -c num performance column index" ) + my_print( " Find the performance result in column index <num> of" ) + my_print( " the test driver's output. Here, a column is defined" ) + my_print( " as a contiguous sequence of non-whitespace characters," ) + my_print( " with the column indices beginning at 0. By default," ) + my_print( " the second-to-last column index in the output is used." ) + my_print( " " ) + my_print( " -d delay sleep() delay" ) + my_print( " Wait <delay> seconds after each execution of the" ) + my_print( " test driver. The default delay is 0." ) + my_print( " " ) + my_print( " -n niter number of iterations" ) + my_print( " Execute the test driver <niter> times. The default" ) + my_print( " value is 10." ) + my_print( " " ) + my_print( " -q quiet; summary only" ) + my_print( " Do not output statistics after every new execution of" ) + my_print( " the test driver; instead, only output the final values" ) + my_print( " after all iterations are complete. The default is to" ) + my_print( " output updated statistics after each iteration." ) + my_print( " " ) + my_print( " -h help" ) + my_print( " Output this information and exit." ) + my_print( " " ) + + +# ------------------------------------------------------------------------------ + +def my_print( s ): + + sys.stdout.write( "%s\n" % s ) + #sys.stdout.flush() + +# ------------------------------------------------------------------------------ + +# Global variables.
+script_name = None +output_name = None + +def main(): + + global script_name + global output_name + + # Obtain the script name. + path, script_name = os.path.split(sys.argv[0]) + + output_name = script_name + + # Default values for optional arguments. + #perf_col = 9 + perf_col = -1 + delay = 0 + niter = 10 + quiet = False + + # Process our command line options. + try: + opts, args = getopt.getopt( sys.argv[1:], "c:d:n:hq" ) + + except getopt.GetoptError as err: + # print help information and exit: + my_print( str(err) ) # will print something like "option -a not recognized" + print_usage() + sys.exit(2) + + for opt, optarg in opts: + if opt == "-c": + perf_col = optarg + elif opt == "-d": + delay = optarg + elif opt == "-n": + niter = optarg + elif opt == "-q": + quiet = True + elif opt == "-h": + print_usage() + sys.exit() + else: + print_usage() + sys.exit() + + # Print usage if we don't have exactly one argument. + if len( args ) != 1: + print_usage() + sys.exit() + + # Acquire our only mandatory argument: the name of the test driver. + driverfile = args[0] + + #my_print( "test driver: %s" % driverfile ) + #my_print( "column num: %s" % perf_col ) + #my_print( "delay: %s" % delay ) + #my_print( "num iter: %s" % niter ) + + # Build a list of iterations. + iters = range( int(niter) ) + + # Run the test driver once to detect the number of lines of output. + p = subprocess.run( driverfile, stdout=subprocess.PIPE ) + lines0 = p.stdout.decode().splitlines() + num_lines0 = int(len(lines0)) + + # Initialize the list of lists (one list per performance result). + aperf = [] + for i in range( num_lines0 ): + aperf.append( [] ) + + for it in iters: + + # Run the test driver. + p = subprocess.run( driverfile, stdout=subprocess.PIPE ) + + # Acquire the lines of output. + lines = p.stdout.decode().splitlines() + + # Accumulate the test driver's latest results into aperf. + for i in range( num_lines0 ): + + # Parse the current line to find the performance value. 
+ line = lines[i] + words = line.split() + if perf_col == -1: + perf = words[ len(words)-2 ] + else: + perf = words[ int(perf_col) ] + + # As unlikely as it is, guard against Inf and NaN. + if float(perf) == float('Inf') or \ + float(perf) == -float('Inf') or \ + float(perf) == float('NaN'): perf = 0.0 + + # Add the performance value to the list at the ith entry of aperf. + aperf[i].append( float(perf) ) + + # Compute stats for the current line. + avgp = statistics.mean( aperf[i] ) + maxp = max( aperf[i] ) + minp = min( aperf[i] ) + + # Only compute stdev() when we have two or more data points. + if len( aperf[i] ) > 1: stdp = statistics.stdev( aperf[i] ) + else: stdp = 0.0 + + # Construct a string to match the performance value and then + # use that string to search-and-replace with four format specs + # for the min, avg, max, and stdev values computed above. + search = '%8s' % perf + newline = re.sub( str(search), ' %7.2f %7.2f %7.2f %6.2f', line ) + + # Search for the column index range that would be present if this were + # matlab-compatible output. The index range will typically be 1:n, + # where n is the number of columns of data. + found_index = False + for word in words: + if re.match( '1:', word ): + index_str = word + found_index = True + break + + # If we find the column index range, we need to update it to reflect + # the replacement of one column of data with four, for a net increase + # of columns. We do so via another instance of re.sub() in which we + # search for the old index string and replace it with the new one. + if found_index: + last_col = int(index_str[2]) + 3 + new_index_str = '1:%1s' % last_col + newline = re.sub( index_str, new_index_str, newline ) + + # If the quiet flag was not given, output the intermediate results. + if not quiet: + print( newline % ( float(minp), float(avgp), float(maxp), float(stdp) ) ) + + # Flush stdout after each set of output prior to sleeping. + sys.stdout.flush() + + # Sleep for a bit until the next iteration.
+ time.sleep( int(delay) ) + + # If the quiet flag was given, output the final results. + if quiet: + + for i in range( num_lines0 ): + + # Parse the current line to find the performance value (only + # needed for call to re.sub() below). + line = lines0[i] + words = line.split() + if perf_col == -1: + perf = words[ len(words)-2 ] + else: + perf = words[ int(perf_col) ] + + # Compute stats for the current line. + avgp = statistics.mean( aperf[i] ) + maxp = max( aperf[i] ) + minp = min( aperf[i] ) + + # Only compute stdev() when we have two or more data points. + if len( aperf[i] ) > 1: stdp = statistics.stdev( aperf[i] ) + else: stdp = 0.0 + + # Construct a string to match the performance value and then + # use that string to search-and-replace with four format specs + # for the min, avg, max, and stdev values computed above. + search = '%8s' % perf + newline = re.sub( str(search), ' %7.2f %7.2f %7.2f %6.2f', line ) + + # Search for the column index range that would be present if this were + # matlab-compatible output. The index range will typically be 1:n, + # where n is the number of columns of data. + found_index = False + for word in words: + if re.match( '1:', word ): + index_str = word + found_index = True + break + + # If we find the column index range, we need to update it to reflect + # the replacement of one column of data with four, for a net increase + # of columns. We do so via another instance of re.sub() in which we + # search for the old index string and replace it with the new one. + if found_index: + last_col = int(index_str[2]) + 3 + new_index_str = '1:%1s' % last_col + newline = re.sub( index_str, new_index_str, newline ) + + # Output the results for the current line. + print( newline % ( float(minp), float(avgp), float(maxp), float(stdp) ) ) + + # Flush stdout afterwards. + sys.stdout.flush() + + + # Return from main(). 
+ return 0 + + + + +if __name__ == "__main__": + main() diff --git a/common.mk b/common.mk index c4ea93e22..42be1590b 100644 --- a/common.mk +++ b/common.mk @@ -438,7 +438,7 @@ INSTALL := install -c # Script for creating a monolithic header file. #FLATTEN_H := $(DIST_PATH)/build/flatten-headers.sh -FLATTEN_H := $(DIST_PATH)/build/flatten-headers.py +FLATTEN_H := $(PYTHON) $(DIST_PATH)/build/flatten-headers.py # Default archiver flags. ARFLAGS := cr diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 35478e0f4..5c3cc8d04 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -51,7 +51,13 @@ THIS_CONFIG := bgq # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk +ifeq ($(CC_VENDOR),ibm) CMISCFLAGS := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist +else ifeq ($(CC_VENDOR),clang) +CMISCFLAGS := -fopenmp +else +$(error xlc or bgclang is required for this configuration.) +endif CPICFLAGS := CWARNFLAGS := -w @@ -69,8 +75,6 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),ibm) CKVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa -else -$(error xlc is required for this configuration.) endif # Flags specific to reference kernels. @@ -78,7 +82,11 @@ CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) # Override the default value for LDFLAGS. +ifeq ($(CC_VENDOR),ibm) LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -qthreaded -qsmp=omp +else ifeq ($(CC_VENDOR),clang) +LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -fopenmp +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 9d9e16295..844a161f3 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -50,22 +50,22 @@ void bli_cntx_init_haswell( cntx_t* cntx ) 8, // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); diff --git a/config/zen/bli_cntx_init_zen.c 
b/config/zen/bli_cntx_init_zen.c index 6507f421b..7b4fa01e2 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -49,16 +49,16 @@ void bli_cntx_init_zen( cntx_t* cntx ) ( 8, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); diff --git a/config_registry b/config_registry index ffa24983e..0d1cbbe19 100644 --- a/config_registry +++ b/config_registry @@ -15,14 +15,14 @@ arm64: cortexa57 generic arm32: cortexa15 cortexa9 generic # Intel architectures. -skx: skx/skx/zen -knl: knl/knl/zen +skx: skx/skx/haswell/zen +knl: knl/knl/haswell/zen haswell: haswell/haswell/zen sandybridge: sandybridge penryn: penryn # AMD architectures. -zen: zen +zen: zen/zen/haswell excavator: excavator/piledriver steamroller: steamroller/piledriver piledriver: piledriver @@ -34,5 +34,8 @@ cortexa53: cortexa53/armv8a cortexa15: cortexa15/armv7a cortexa9: cortexa9/armv7a +# IBM architectures. 
+bgq: bgq + + # Generic architectures. generic: generic diff --git a/configure b/configure index e5c17fd5f..9fcf5605f 100755 --- a/configure +++ b/configure @@ -163,9 +163,6 @@ print_usage() echo " incur additional overhead in some (but not all)" echo " situations." echo " " - echo " -q, --quiet Suppress informational output. By default, configure" - echo " is verbose. (NOTE: -q is not yet implemented)" - echo " " echo " -i SIZE, --int-size=SIZE" echo " " echo " Set the size (in bits) of internal BLIS integers and" @@ -230,6 +227,19 @@ print_usage() echo " detects the presence of libmemkind, libmemkind is used" echo " by default, and otherwise it is not used by default." echo " " + echo " -r METHOD, --thread-part-jrir=METHOD" + echo " " + echo " Request a method of assigning micropanels to threads in" + echo " the JR and IR loops. Valid options are 'slab' and 'rr'." + echo " Using 'slab' assigns (as much as possible) contiguous" + echo " regions of micropanels to each thread while the latter" + echo " assigns micropanels to threads in a round-robin fashion." + echo " (NOTE: Specifying this option constitutes a *request*," + echo " which may be ignored in select situations if the" + echo " implementation has a good reason to do so.) The chosen" + echo " method also applies during the packing of A and B. The" + echo " default method is 'slab'." + echo " " echo " --force-version=STRING" echo " " echo " Force configure to use an arbitrary version string" @@ -244,6 +254,9 @@ print_usage() echo " a sanity check to make sure these lists are constituted" echo " as expected." echo " " + echo " -q, --quiet Suppress informational output. By default, configure" + echo " is verbose. (NOTE: -q is not yet implemented)" + echo " " echo " -h, --help Output this information and quit." echo " " echo " Environment Variables:" @@ -1609,6 +1622,9 @@ main() # The threading flag. threading_model='no' + # The method of assigning micropanels to threads in the JR and IR loops.
+ thread_part_jrir='slab' + # Option variables. quiet_flag='' show_config_list='' @@ -1661,7 +1677,7 @@ main() # -- Command line option/argument parsing ---------------------------------- # Process our command line options. - while getopts ":hp:d:s:t:qci:b:-:" opt; do + while getopts ":hp:d:s:t:r:qci:b:-:" opt; do case $opt in -) case "$OPTARG" in @@ -1725,6 +1741,9 @@ main() enable-threading=*) threading_model=${OPTARG#*=} ;; + thread-part-jrir=*) + thread_part_jrir=${OPTARG#*=} + ;; disable-threading) threading_model='no' ;; @@ -1808,6 +1827,9 @@ main() t) threading_model=$OPTARG ;; + r) + thread_part_jrir=$OPTARG + ;; i) int_type_size=$OPTARG ;; @@ -1823,7 +1845,7 @@ main() esac done shift $(($OPTIND - 1)) - + # Parse environment variables while [ $# -gt 0 ]; do case $1 in @@ -2383,7 +2405,7 @@ main() elif [ "x${threading_model}" = "xpthreads" ] || [ "x${threading_model}" = "xpthread" ] || [ "x${threading_model}" = "xposix" ]; then - echo "${script_name}: using Pthreads for threading." + echo "${script_name}: using POSIX threads for threading." enable_pthreads='yes' enable_pthreads_01=1 threading_model="pthreads" # Standardize the value. @@ -2394,7 +2416,22 @@ main() echo "${script_name}: *** Unsupported threading model: ${threading_model}." exit 1 fi - + # Check the method of assigning micropanels to threads in the JR and IR + # loops. + enable_jrir_slab_01=0 + enable_jrir_rr_01=0 + if [ "x${thread_part_jrir}" = "xslab" ]; then + echo "${script_name}: requesting slab threading in jr and ir loops." + enable_jrir_slab_01=1 + elif [ "x${thread_part_jrir}" = "xrr" ]; then + echo "${script_name}: requesting round-robin threading in jr and ir loops." + enable_jrir_rr_01=1 + else + echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}." + exit 1 + fi + # Convert 'yes' and 'no' flags to booleans.
if [ "x${enable_packbuf_pools}" = "xyes" ]; then echo "${script_name}: internal memory pools for packing buffers are enabled." @@ -2461,16 +2498,7 @@ main() else echo "${script_name}: mixed datatype support is disabled." - if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then - echo "${script_name}: *** Mixed datatype optimizations requiring extra memory are only" - echo "${script_name}: *** available when mixed datatype support is also enabled." - echo "${script_name}: *** Please enable mixed datatype support, or disable mixed datatype" - echo "${script_name}: *** optimizations requiring extra memory, and re-run configure." - exit 1 - else - enable_mixed_dt_extra_mem_01=0 - fi - + enable_mixed_dt_extra_mem_01=0 enable_mixed_dt_01=0 fi @@ -2649,6 +2677,8 @@ main() | perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \ | sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ + | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ + | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ | sed -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \ | sed -e "s/@int_type_size@/${int_type_size}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ @@ -2742,7 +2772,7 @@ main() # -- Mirror source directory hierarchies to object directories ------------- - + # Combine the config_list with the config_name and then remove duplicates. config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}") diff --git a/docs/MixedDatatypes.md b/docs/MixedDatatypes.md index 90c2a8703..ce9981389 100644 --- a/docs/MixedDatatypes.md +++ b/docs/MixedDatatypes.md @@ -205,6 +205,17 @@ operands and thus fixed; the user may not specify a different computation domain, even if the mixed-domain case would reasonably allow for computing in either domain. 
+* **Sandboxes should be used with caution.** When building a `gemm` sandbox in +BLIS, please consider either (a) disabling mixed datatype support, or (b) +consciously **never** running the testsuite with mixed domain or precision +computation enabled. Even the reference `ref99` sandbox implementation in BLIS +does not support mixing datatypes. If you do choose to enable a sandbox while +also keeping mixed datatype support enabled in BLIS, make sure that the +mixing of datatypes is disabled in the testsuite's `input.general` file +(unless, of course, you decide to implement all mixed datatype cases within +your sandbox). This issue is also discussed in the documentation for +[Sandboxes](Sandboxes.md#known-issues). + ## Conclusion For more information and documentation on BLIS, please visit the [BLIS github page](https://github.com/flame/blis/). diff --git a/docs/Sandboxes.md b/docs/Sandboxes.md index 896e7332e..a205be02c 100644 --- a/docs/Sandboxes.md +++ b/docs/Sandboxes.md @@ -4,6 +4,7 @@ * **[Enabling a sandbox](Sandboxes.md#enabling-a-sandbox)** * **[Sandbox rules](Sandboxes.md#sandbox-rules)** * **[Caveats](Sandboxes.md#caveats)** +* **[Known Issues](Sandboxes.md#known-issues)** * **[Conclusion](Sandboxes.md#conclusion)** @@ -182,6 +183,20 @@ guidance from BLIS developers by opening a Notwithstanding these limitations, hopefully you still find BLIS sandboxes useful! +## Known Issues + +* **Mixed datatype support.** Unless you *really* know what you are doing, you +should probably disable mixed datatype support when using a sandbox. (Mixed +datatype support can be disabled by configuring with `--disable-mixed-dt`.) The +BLIS testsuite is smart enough to verify that you've configured BLIS with mixed +datatype support before allowing you to test with mixed domains/precisions +enabled in `input.general`. 
However, if those options *are* enabled and BLIS was +built with mixed datatype support, then BLIS assumes that the implementation of +`gemm` will support mixing of datatypes. BLIS *must* assume this, because +there's no way for it to confirm at runtime that an implementation was written +to support mixing datatypes. Note that even the `ref99` sandbox included with +BLIS does not support mixed-datatype computation. + ## Conclusion If you encounter any problems, or are really bummed-out that `gemm` is the diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 6c88ea893..194c66a65 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -39,9 +40,7 @@ #include "bli_packm_part.h" -#include "bli_packm_unb_var1.h" - -#include "bli_packm_blk_var1.h" +#include "bli_packm_var.h" #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4mi.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 195315886..3265b3beb 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,29 +37,30 @@ #define FUNCPTR_T packm_fp -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool_t invdiag, - bool_t revifup, - bool_t reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void* packm_ker, - cntx_t* cntx, - thrinfo_t* thread - ); +typedef void (*FUNCPTR_T) + ( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + pack_t schema, + bool_t invdiag, + bool_t revifup, + bool_t reviflo, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + inc_t is_p, + dim_t pd_p, inc_t ps_p, + void* packm_ker, + cntx_t* cntx, + thrinfo_t* thread + ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); @@ -195,7 +197,7 @@ void bli_packm_blk_var1 // use BLIS_ONE to indicate no scaling during packing. kappa_p = &BLIS_ONE; } - + // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_c, kappa_p ); } @@ -307,7 +309,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict p_begin; \ \ dim_t iter_dim; \ - dim_t num_iter; \ + dim_t n_iter; \ dim_t it, ic, ip; \ dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ @@ -418,16 +420,16 @@ void PASTEMAC(ch,varname) \ else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ \ /* Set the initial values and increments for indices related to C and P based on whether reverse iteration was requested. 
*/ \ if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ { \ - ic0 = (num_iter - 1) * panel_dim_max; \ + ic0 = (n_iter - 1) * panel_dim_max; \ ic_inc = -panel_dim_max; \ - ip0 = num_iter - 1; \ + ip0 = n_iter - 1; \ ip_inc = -1; \ } \ else \ @@ -440,16 +442,21 @@ void PASTEMAC(ch,varname) \ \ p_begin = p_cast; \ \ -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -514,7 +521,11 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ /* We nudge the imaginary stride up by one if it is odd. */ \ is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* NOTE: We MUST use round-robin partitioning when packing + micropanels of a triangular matrix. Hermitian/symmetric + and general packing may use slab or round-robin, depending + on which was selected at configure-time. 
*/ \ + if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( strucc, \ diagoffp_i, \ @@ -553,7 +564,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ \ is_p_use = is_p; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( strucc, \ diagoffc_i, \ @@ -589,7 +602,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ \ is_p_use = is_p; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( BLIS_GENERAL, \ 0, \ @@ -613,6 +628,23 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ p_inc = ps_p; \ } \ \ + p_begin += p_inc; \ +\ + } \ +} + +INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) + + + +/* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +if ( col_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +*/ /* if ( row_stored ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ @@ -671,8 +703,7 @@ bli_thread_obarrier( thread ); \ } \ bli_thread_obarrier( thread ); \ } \ -*/ \ -\ +*/ /* if ( bli_is_4mi_packed( schema ) ) { \ printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ @@ -695,18 +726,11 @@ bli_thread_obarrier( thread ); \ ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ } \ } \ -*/ \ -/* -*/ \ -\ -/* -*/ \ +*/ /* PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ -\ -\ +*/ /* if ( row_stored ) { \ PASTEMAC(chr,fprintm)( 
stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ @@ -719,9 +743,7 @@ bli_thread_obarrier( thread ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ } \ -*/ \ -\ -\ +*/ /* if ( col_stored ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ @@ -733,12 +755,4 @@ bli_thread_obarrier( thread ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ } \ -*/ \ -\ - p_begin += p_inc; \ -\ - } \ -} - -INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) - +*/ diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c index 4efd0074c..0930f282b 100644 --- a/frame/1m/packm/bli_packm_blk_var1_md.c +++ b/frame/1m/packm/bli_packm_blk_var1_md.c @@ -146,7 +146,7 @@ void PASTEMAC2(chc,chp,varname) \ ctype_p* restrict p_begin; \ \ dim_t iter_dim; \ - dim_t num_iter; \ + dim_t n_iter; \ dim_t it, ic, ip; \ doff_t ic_inc, ip_inc; \ dim_t panel_len_full; \ @@ -220,7 +220,7 @@ void PASTEMAC2(chc,chp,varname) \ } \ \ /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ \ { \ ic_inc = panel_dim_max; \ @@ -229,16 +229,25 @@ void PASTEMAC2(chc,chp,varname) \ \ p_begin = p_cast; \ \ -/* -if ( row_stored ) \ -PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: b orig", m, n, \ - c_cast, rs_c, cs_c, "%5.2f", "" ); \ -if ( col_stored ) \ -PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \ - c_cast, rs_c, cs_c, "%5.2f", "" ); \ -*/ \ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. 
*/ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ \ - for ( ic = 0, ip = 0, it = 0; it < num_iter; \ + /* Suppress unused variable warnings when slab partitioning is enabled, + since the slab-based definition of bli_packm_my_iter() does not + actually use tid or nt. */ \ + ( void )nt; ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + for ( ic = 0, ip = 0, it = 0; it < n_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -252,7 +261,7 @@ PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTEMAC2(chc,chp,packm_struc_cxk_md) \ ( \ diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 41d68d356..6c77caf35 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,7 +37,33 @@ // thrinfo_t macros specific to packm. 
// -#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +/* +#define bli_packm_thread_my_iter( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) +*/ + +#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ +\ + ( i % n_way == work_id % n_way ) + +#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ +\ + ( start <= i && i < end ) + +// Define a general-purpose version of bli_packm_my_iter() whose definition +// depends on whether slab or round-robin partitioning was requested at +// configure-time. +#ifdef BLIS_ENABLE_JRIR_SLAB + + #define bli_packm_my_iter bli_packm_my_iter_sl + +#else // BLIS_ENABLE_JRIR_RR + + #define bli_packm_my_iter bli_packm_my_iter_rr + +#endif + // // thrinfo_t APIs specific to packm. diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_var.h similarity index 72% rename from frame/1m/packm/bli_packm_blk_var1.h rename to frame/1m/packm/bli_packm_var.h index 396160da5..7531bc9cb 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,15 +33,50 @@ */ -void bli_packm_blk_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t +// +// Prototype object-based interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* c, \ + obj_t* p, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* t \ ); +GENPROT( packm_unb_var1 ) +GENPROT( packm_blk_var1 ) + +// +// Prototype BLAS-like interfaces with void pointer operands. 
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT_BASIC0( packm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 228f22714..2110f1ec6 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,24 +39,34 @@ // gemm -#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. +#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // herk -#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
+#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm -#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. +#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) + +#define bli_trmm_my_iter_rr( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm -#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trsm_my_iter_rr( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 0c62b69ac..73b8bed06 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. 
- bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 6a19e1bdb..3c25d7fa8 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 2332a6cf7..975dc8d95 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -53,11 +54,19 @@ cntl_t* bli_gemmbp_cntl_create pack_t schema_b ) { - void* macro_kernel_p = bli_gemm_ker_var2; + void* macro_kernel_fp; + void* packa_fp; + void* packb_fp; - // Change the macro-kernel if the operation family is herk or trmm. - if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; - else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + // Use the function pointers to the macrokernels that use slab + // assignment of micropanels to threads in the jr and ir loops. 
+ if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; + else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2; + else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; + else /* should never execute */ macro_kernel_fp = NULL; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node @@ -72,7 +81,7 @@ cntl_t* bli_gemmbp_cntl_create ( family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, + macro_kernel_fp, gemm_cntl_bu_ke ); @@ -80,7 +89,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -104,7 +113,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand - bli_packm_blk_var1, + packb_fp, BLIS_KR, BLIS_NR, FALSE, // do NOT invert diagonal diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 81552893a..07226388a 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -114,7 +115,8 @@ void bli_gemm_int if ( im != BLIS_NAT ) { - if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + if ( im == BLIS_4M1B ) + if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } } diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c index f7038584a..e60c78a5a 100644 --- a/frame/3/gemm/bli_gemm_ker_var1.c +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,6 +33,8 @@ */ +#if 0 + #include "blis.h" void bli_gemm_ker_var1 @@ -55,3 +58,5 @@ void bli_gemm_ker_var1 bli_gemm_ker_var2( b, a, c, cntx, rntm, cntl, thread ); } +#endif + diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 1aa032ad9..cee050b85 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -132,7 +133,6 @@ void bli_gemm_ker_var2 // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. -#if 1 if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_l3_ind_recast_1m_params @@ -146,7 +146,6 @@ void bli_gemm_ker_var2 rs_c, cs_c ); } -#endif #ifdef BLIS_ENABLE_GEMM_MD // Tweak parameters in select mixed domain cases cases. @@ -300,17 +299,29 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -325,7 +336,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -335,12 +346,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c index e414722b9..e52aa7f9e 100644 --- a/frame/3/gemm/bli_gemm_ker_var2_md.c +++ b/frame/3/gemm/bli_gemm_ker_var2_md.c @@ -273,14 +273,29 @@ void PASTEMAC2(chc,che,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype_e* restrict a1; \ ctype_c* restrict c11; \ @@ -295,7 +310,7 @@ void PASTEMAC2(chc,che,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype_e* restrict a2; \ \ @@ -305,12 +320,12 @@ void PASTEMAC2(chc,che,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 9baee6187..61a8136ec 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,7 @@ GENPROT( gemm_packa ) GENPROT( gemm_packb ) GENPROT( gemm_ker_var1 ) + GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 878889d2a..08992145a 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + dim_t jr_inc = jr_num_threads; \ + dim_t ir_inc = ir_num_threads; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ @@ -295,12 +299,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c new file mode 100644 index 000000000..b48f46bc0 --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); + + +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + 
// Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2 ) + diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c new file mode 100644 index 000000000..3cb108eea --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -0,0 +1,380 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_gemm_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); 
+ inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr ) + diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c new file mode 100644 index 000000000..3e9e28835 --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -0,0 +1,380 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_gemm_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); 
+ inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl ) + diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index f45542d37..8dd94efbc 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -282,17 +283,57 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of C (if it exists). 
+ NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -307,7 +348,113 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -320,12 +467,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 3061a5c39..53f27cb92 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -117,7 +118,7 @@ void bli_herk_u_ker_var2 // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; + f = ftypes[dt_exec]; // Invoke the function. f( diagoffc, @@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \ \ /* If there is a zero region to the left of where the diagonal of C intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ if ( diagoffc > 0 ) \ { \ jp = diagoffc / NR; \ @@ -282,17 +285,56 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). 
*/ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. 
*/ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -307,7 +349,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -320,12 +362,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -405,6 +447,114 @@ void PASTEMAC(ch,varname) \ } \ } \ } \ +\ + /* If there is no rectangular region, then we're done. 
*/ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd loop of the + remaining rectangular region of C (and also use default partitioning + for the 1st loop). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ } INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index 58061a8dd..d7cb75943 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,6 +57,7 @@ void PASTEMAC0(opname) \ //GENPROT( herk_blk_var3 ) GENPROT( herk_x_ker_var2 ) + GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) //GENPROT( herk_packa ) diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/herk/bli_herk_x_ker_var2.c index 10b6ab826..1dc95772a 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c new file mode 100644 index 000000000..bd7b69e81 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + 
obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. 
*/ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/herk/other/bli_herk_l_ker_var2.c new file mode 100644 index 000000000..832421813 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = 
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c new file mode 100644 index 000000000..7393f8e1b --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2rr.c @@ -0,0 +1,555 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_herk_l_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* 
buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c new file mode 100644 index 000000000..569684bf7 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2sl.c @@ -0,0 +1,556 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_herk_l_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. 
That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd + loop and slab partitioning in the 1st loop for the remaining + triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c new file mode 100644 index 000000000..398213282 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + 
obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. 
*/ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/herk/other/bli_herk_u_ker_var2.c new file mode 100644 index 000000000..8d1a3021d --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = 
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c new file mode 100644 index 000000000..e0ac82745 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2rr.c @@ -0,0 +1,557 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_herk_u_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* 
buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). 
*/ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c new file mode 100644 index 000000000..b182561d7 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2sl.c @@ -0,0 +1,558 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_herk_u_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c 
= bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). 
*/ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and slab partitioning in the 1st loop for the initial triangular region + of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st loops + loop for the remaining triangular region of C. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). 
*/ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl ) + diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 3778c7302..4d6b49a25 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -85,6 +86,10 @@ void bli_trmm_front } #if 0 + // NOTE: This case casts right-side trmm in terms of left side. This + // reduces the number of macrokernels exercised to two (trmm_ll and + // trmm_lu) but can lead to the microkernel being executed with an + // output matrix that is stored counter to its output preference. // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied @@ -98,6 +103,11 @@ void bli_trmm_front } #else + // NOTE: This case computes right-side trmm natively with trmm_rl and + // trmm_ru macrokernels. This code path always gives us the opportunity + // to transpose the entire operation so that the effective storage format + // of the output matrix matches the microkernel's output preference. + // Thus, from a performance perspective, this case is preferred. // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index eef104eed..a9df2571a 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -320,29 +321,45 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -372,17 +389,18 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -436,23 +454,24 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -501,17 +520,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 23dd22cb8..bb843c84d 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -327,29 +328,45 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -379,17 +396,18 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. 
*/ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -443,23 +461,24 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -508,17 +527,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae44e8ff9..e03de3e08 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -327,15 +328,152 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. 
*/ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of B (if it exists). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -361,7 +499,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ @@ -369,7 +506,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -378,7 +515,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -389,11 +526,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -452,83 +589,6 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ -\ - /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t - object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Handle interior and edge cases separately. 
*/ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += cstep_b; \ - } \ \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 9d7ec4cfe..5261bf13f 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ - dim_t i, j; \ + dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ @@ -327,16 +328,58 @@ void PASTEMAC(ch,varname) \ \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ -\ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -361,7 +404,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ @@ -369,7 +411,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -378,7 +420,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -389,11 +431,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -452,30 +494,75 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the remaining rectangular region of B. 
+ NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -520,19 +607,12 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ } \ -\ - c1 += cstep_c; \ } \ \ +\ +\ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index bde7977b5..4355fed71 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,6 +57,7 @@ void PASTEMAC0(opname) \ //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) + GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index d0e157877..df12c25ac 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c new file mode 100644 index 000000000..fbbbb7b2f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -0,0 +1,519 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); + + +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* 
buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) 
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c new file mode 100644 index 000000000..a940fdb6f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -0,0 +1,535 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_ll_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) 
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). + NOTE: Parallelism in the 1st loop is disabled for now. 
*/ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c new file mode 100644 index 000000000..718c6fba1 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -0,0 +1,535 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_ll_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( 
a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. 
*/ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. 
*/ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. 
*/ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c new file mode 100644 index 000000000..2fe01d0e2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -0,0 +1,527 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); + + +void bli_trmm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \
+\
+ ctype* restrict one = PASTEMAC(ch,1); \
+ ctype* restrict zero = PASTEMAC(ch,0); \
+ ctype* restrict a_cast = a; \
+ ctype* restrict b_cast = b; \
+ ctype* restrict c_cast = c; \
+ ctype* restrict alpha_cast = alpha; \
+ ctype* restrict beta_cast = beta; \
+ ctype* restrict b1; \
+ ctype* restrict c1; \
+\
+ doff_t diagoffa_i; \
+ dim_t k_full; \
+ dim_t m_iter, m_left; \
+ dim_t n_iter, n_left; \
+ dim_t m_cur; \
+ dim_t n_cur; \
+ dim_t k_a1112; \
+ dim_t off_a1112; \
+ dim_t i, j; \
+ inc_t rstep_a; \
+ inc_t cstep_b; \
+ inc_t rstep_c, cstep_c; \
+ inc_t istep_a; \
+ inc_t istep_b; \
+ inc_t off_scl; \
+ inc_t ss_a_num; \
+ inc_t ss_a_den; \
+ inc_t ps_a_cur; \
+ inc_t is_a_cur; \
+ auxinfo_t aux; \
+\
+ /*
 Assumptions/assertions:
 rs_a == 1
 cs_a == PACKMR
 pd_a == MR
 ps_a == stride to next micro-panel of A
 rs_b == PACKNR
 cs_b == 1
 pd_b == NR
 ps_b == stride to next micro-panel of B
 rs_c == (no assumptions)
 cs_c == (no assumptions)
 */ \
+\
+ /* Safety trap: Certain indexing within this macro-kernel does not
 work as intended if both MR and NR are odd. */ \
+ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+ /* If any dimension is zero, return immediately. */ \
+ if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+ /* Safeguard: If the current block of A is entirely below the diagonal,
 it is implicitly zero. So we do nothing. */ \
+ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
+\
+ /* Compute k_full. For all trmm, k_full is simply k. This is
 needed because some parameter combinations of trmm reduce k
 to advance past zero regions in the triangular matrix, and
 when computing the imaginary stride of B (the non-triangular
 matrix), which is used by 4m1/3m1 implementations, we need
 this unreduced value of k. */ \
+ k_full = k; \
+\
+ /* Compute indexing scaling factor for 4m or 3m.
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. 
If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c new file mode 100644 index 000000000..ab1efa46d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -0,0 +1,542 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_lu_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t off_a1112; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely below the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is
 needed because some parameter combinations of trmm reduce k
 to advance past zero regions in the triangular matrix, and
 when computing the imaginary stride of B (the non-triangular
 matrix), which is used by 4m1/3m1 implementations, we need
 this unreduced value of k. */ \
+ k_full = k; \
+\
+ /* Compute indexing scaling factor for 4m or 3m. This is
 needed because one of the packing register blocksizes (PACKMR
 or PACKNR) is used to index into the micro-panels of the non-
 triangular matrix when computing with a diagonal-intersecting
 micro-panel of the triangular matrix. In the case of 4m or 3m,
 real values are stored in both sub-panels, and so the indexing
 needs to occur in units of real values. The value computed
 here is divided into the complex pointer offset to cause the
 pointer to be advanced by the correct value. */ \
+ if ( bli_is_4mi_packed( schema_a ) || \
+ bli_is_3mi_packed( schema_a ) || \
+ bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+ else off_scl = 1; \
+\
+ /* Compute the storage stride scaling. Usually this is just 1.
 However, in the case of interleaved 3m, we need to scale the
 offset by 3/2. And if we are packing real-only, imag-only, or
 summed-only, we need to scale the computed panel sizes by 1/2
 to compensate for the fact that the pointer arithmetic occurs
 in terms of complex elements rather than real elements. */ \
+ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
+ else { ss_a_num = 1; ss_a_den = 1; } \
+\
+ /* If there is a zero region to the left of where the diagonal of A
 intersects the top edge of the block, adjust the pointer to B and
 treat this case as if the diagonal offset were zero. Note that we
 don't need to adjust the pointer to A since packm would have simply
 skipped over the region that was not stored.
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c new file mode 100644 index 000000000..1bb4e1b6d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -0,0 +1,542 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_lu_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t off_a1112; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely below the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). 
*/ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c new file mode 100644 index 000000000..860295c4c --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); + + +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t 
ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. 
*/ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. 
*/ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c new file mode 100644 index 000000000..1b1549951 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -0,0 +1,598 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_rl_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) 
The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. */ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of B (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c new file mode 100644 index 000000000..80e9c7f2f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -0,0 +1,598 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_rl_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b 
); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. 
Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. 
*/ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. */ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of B (if it exists). */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c new file mode 100644 index 000000000..e0adf4cf2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); + + +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) */ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c new file mode 100644 index 000000000..ff118ab6d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -0,0 +1,618 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_ru_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j, jb0; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) 
*/ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter_tri; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. 
*/ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops the remaining triangular region of B. */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ +\ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c new file mode 100644 index 000000000..0fc2d514a --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -0,0 +1,618 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_ru_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j, jb0; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) 
*/ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter_tri; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. 
*/ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops the remaining triangular region of B. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ +\ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl ) + diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 8b666b3f4..783572944 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 6be5965a3..7286ba7e0 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
- bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index ee40189e5..24f8f37bf 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -53,7 +54,16 @@ cntl_t* bli_trsm_l_cntl_create pack_t schema_b ) { - void* macro_kernel_p = bli_trsm_xx_ker_var2; + void* macro_kernel_p; + void* packa_fp; + void* packb_fp; + + // Use the function pointer to the macrokernels that use slab + // assignment of micropanels to threads in the jr and ir loops. + macro_kernel_p = bli_trsm_xx_ker_var2; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; const opid_t family = BLIS_TRSM; @@ -78,7 +88,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal @@ -102,7 +112,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, - bli_packm_blk_var1, + packb_fp, BLIS_MR, BLIS_NR, FALSE, // do NOT invert diagonal @@ -140,8 +150,12 @@ cntl_t* bli_trsm_r_cntl_create pack_t schema_b ) { + // NOTE: trsm macrokernels are presently disabled for right-side execution. void* macro_kernel_p = bli_trsm_xx_ker_var2; + void* packa_fp = bli_packm_blk_var1; + void* packb_fp = bli_packm_blk_var1; + const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. 
@@ -165,7 +179,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, - bli_packm_blk_var1, + packa_fp, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -189,7 +203,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, - bli_packm_blk_var1, + packb_fp, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 021f8baf2..c561de93d 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -342,25 +343,42 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (0 )*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -408,12 +426,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -473,12 +490,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -534,10 +550,6 @@ void PASTEMAC(ch,varname) \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 0ddcd16d4..6db5c6569 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -350,25 +351,42 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (m_iter-1)*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ @@ -418,12 +436,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -483,12 +500,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -544,10 +560,6 @@ void PASTEMAC(ch,varname) \ \ c11 -= rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 1cf456678..f69f5471d 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -430,7 +431,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a11; \ ctype* restrict a12; \ @@ -444,12 +445,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ @@ -516,7 +517,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ @@ -524,12 +525,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index b5a76d03a..2f3071d61 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -423,7 +424,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a10; \ ctype* restrict a11; \ @@ -437,12 +438,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ @@ -509,7 +510,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ @@ -517,12 +518,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 5ac72c28c..ebd7afc2a 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,7 @@ GENPROT( trsm_packa ) GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) + GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index 24d55af24..c8527f647 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c new file mode 100644 index 000000000..4e7e1b850 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -0,0 +1,593 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); + + +void bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + 
// attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. 
*/ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. 
*/ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, 
"trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c new file mode 100644 index 000000000..844d76ab7 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trsm_ll_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* 
buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, 
NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr ) + diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c new file mode 100644 index 000000000..e67de28fe --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trsm_ll_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* 
buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, 
NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c new file mode 100644 index 000000000..a8978df86 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -0,0 +1,574 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); + + +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + 
// attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. 
*/ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. 
*/ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c new file mode 100644 index 000000000..3d2792508 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -0,0 +1,586 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trsm_lu_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. 
This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. 
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c new file mode 100644 index 000000000..486294352 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -0,0 +1,586 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trsm_lu_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. 
This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored.
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the block B11 and the + panel B21. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately.
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl ) + diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c new file mode 100644 index 000000000..70b3e456d --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -0,0 +1,591 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); + + +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "rl" case (right-side/lower- + triangular), it becomes upper-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t k_b11; \ + dim_t k_b21; \ + dim_t off_b11; \ + dim_t off_b21; \ + dim_t i, j, jb; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case). 
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it so that + we can index to the correct place in C (corresponding to the + part of the panel of B that was packed). + NOTE: This is NOT being done to skip over "no-op" iterations, + as with the trsm_lu macro-kernel. This MUST be done for correct + execution because we use n (via n_iter) to compute diagonal and + index offsets for backwards movement through B. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A. 
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( jb = 0; jb < n_iter; ++jb ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict b2; \ +\ + j = n_iter - 1 - jb; \ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1 + (n_iter-1)*cstep_c; \ +\ + n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides below the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b11 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b11; \ + k_b11 = NR; \ + k_b21 = k_b1121 - NR; \ + off_b21 = off_b11 + k_b11; \ +\ + /* Compute the addresses of the triangular block B11 and the + panel B21. */ \ + b11 = b1; \ + /* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \ + b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A11 block and A12 panel. */ \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ + a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 -= cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c new file mode 100644 index 000000000..289bb5d9f --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -0,0 +1,584 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); + + +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "ru" case (right-side/upper- + triangular), it becomes lower-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t k_b01; \ + dim_t off_b01; \ + dim_t off_b11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case). 
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) */ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A. 
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b01 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ + k_b01 = k_b0111 - NR; \ + off_b11 = k_b01; \ +\ + /* Compute the addresses of the panel B10 and the triangular + block B11. */ \ + b01 = b1; \ + /* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \ + b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A10 panel and A11 block. */ \ + a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) + diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 344a07447..42ed83bc5 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -94,6 +95,60 @@ gint_t bli_info_get_enable_packbuf_pools( void ) return 0; #endif } +gint_t bli_info_get_enable_threading( void ) +{ + if ( bli_info_get_enable_openmp() || + bli_info_get_enable_pthreads() ) return 1; + else return 0; +} +gint_t bli_info_get_enable_openmp( void ) +{ +#ifdef BLIS_ENABLE_OPENMP + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_pthreads( void ) +{ +#ifdef BLIS_ENABLE_PTHREADS + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_thread_part_jrir_slab( void ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_thread_part_jrir_rr( void ) +{ +#ifdef BLIS_ENABLE_JRIR_RR + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_memkind( void ) +{ +#ifdef BLIS_ENABLE_MEMKIND + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_sandbox( void ) +{ +#ifdef BLIS_ENABLE_SANDBOX + return 1; +#else + return 0; +#endif +} diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 82ff86b03..96aeade85 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,13 @@ gint_t bli_info_get_enable_blas( void ); gint_t bli_info_get_enable_cblas( void ); gint_t bli_info_get_blas_int_type_size( void ); gint_t bli_info_get_enable_packbuf_pools( void ); +gint_t bli_info_get_enable_threading( void ); +gint_t bli_info_get_enable_openmp( void ); +gint_t bli_info_get_enable_pthreads( void ); +gint_t bli_info_get_thread_part_jrir_slab( void ); +gint_t bli_info_get_thread_part_jrir_rr( void ); +gint_t bli_info_get_enable_memkind( void ); +gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c index 9b5803d9f..1f40933b0 100644 --- a/frame/base/bli_prune.c +++ b/frame/base/bli_prune.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, // matrix is empty. This is not strictly needed but rather a minor // optimization, as it would prevent threads that would otherwise get // subproblems on BLIS_ZEROS operands from calling the macro-kernel, - // because bli_thread_get_range*() would return empty ranges, which would + // because bli_thread_range*() would return empty ranges, which would // cause the variant's for loop from executing any iterations. 
// NOTE: this should only ever execute if the primary object is // triangular because that is the only structure type with subpartitions diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index eb92f08b0..4d235700f 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } +static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) +{ + return ( bool_t ) + ( bli_is_strictly_above_diag_n( diagoff, m, n ) || + bli_is_strictly_below_diag_n( diagoff, m, n ) ); +} + static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool_t ) @@ -784,10 +792,25 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) ( i != 0 || n_left == 0 ); } -static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth ) +static bool_t bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool_t ) - ( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) ); + ( i == end_iter - 1 ); +} + +static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ + return ( bool_t ) + ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); +} + +static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + return bli_is_last_iter_sl( i, end_iter, tid, nth ); +#else // BLIS_ENABLE_JRIR_RR + return bli_is_last_iter_rr( i, end_iter, tid, nth ); +#endif } diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 
bfe7e476f..3b1ef94ce 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -230,8 +230,52 @@ void bli_l3_thread_decorator _Pragma( "omp parallel num_threads(n_threads)" ) { + dim_t n_threads_real = omp_get_num_threads(); dim_t id = omp_get_thread_num(); + // Check if the number of OpenMP threads created within this parallel + // region is different from the number of threads that were requested + // of BLIS. This inequality may trigger when, for example, the + // following conditions are satisfied: + // - an application is executing an OpenMP parallel region in which + // BLIS is invoked, + // - BLIS is configured for multithreading via OpenMP, + // - OMP_NUM_THREADS = t > 1, + // - the number of threads requested of BLIS (regardless of method) + // is p <= t, + // - OpenMP nesting is disabled. + // In this situation, the application spawns t threads. Each application + // thread calls gemm (for example). Each gemm will attempt to spawn p + // threads via OpenMP. However, since nesting is disabled, the OpenMP + // implementation finds that t >= p threads are already spawned, and + // thus it doesn't spawn *any* additional threads for each gemm. + if ( n_threads_real != n_threads ) + { + // If the number of threads active in the current region is not + // equal to the number requested of BLIS, we then only continue + // if the number of threads in the current region is 1. If, for + // example, BLIS requested 4 threads but only got 3, then we + // abort(). + if ( id == 0 ) + { + if ( n_threads_real != 1 ) + { + bli_print_msg( "A different number of threads was " + "created than was requested.", + __FILE__, __LINE__ ); + bli_abort(); + } + + n_threads = 1; + bli_thrcomm_init( gl_comm, 1 ); + bli_rntm_set_num_threads_only( 1, rntm ); + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); + } + + // Synchronize all threads and continue.
+ _Pragma( "omp barrier" ) + } + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 2931d0951..8b9f41b75 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -61,7 +61,7 @@ void bli_thread_finalize( void ) // ----------------------------------------------------------------------------- -void bli_thread_get_range_sub +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -72,6 +72,9 @@ void bli_thread_get_range_sub ) { dim_t n_way = bli_thread_n_way( thread ); + + if ( n_way == 1 ) { *start = 0; *end = n; return; } + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; @@ -202,7 +205,7 @@ void bli_thread_get_range_sub } } -siz_t bli_thread_get_range_l2r +siz_t bli_thread_range_l2r ( thrinfo_t* thr, obj_t* a, @@ -216,13 +219,13 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_r2l +siz_t bli_thread_range_r2l ( thrinfo_t* thr, obj_t* a, @@ -236,13 +239,13 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_t2b +siz_t bli_thread_range_t2b ( thrinfo_t* thr, obj_t* a, @@ -256,13 +259,13 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, m, bf, + FALSE, start, end ); return n * ( *end - *start ); } -siz_t bli_thread_get_range_b2t +siz_t bli_thread_range_b2t ( thrinfo_t* thr, obj_t* a, @@ -276,15 +279,15 @@ 
siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } // ----------------------------------------------------------------------------- -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -495,17 +498,17 @@ siz_t bli_find_area_trap_l // ----------------------------------------------------------------------------- -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -570,7 +573,7 @@ siz_t bli_thread_get_range_weighted_sub // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = - bli_thread_get_range_width_l + bli_thread_range_width_l ( diagoff_j, m, n_left, j, n_way, @@ -614,7 +617,7 @@ siz_t bli_thread_get_range_weighted_sub bli_toggle_bool( &handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thread, diagoff, uplo, m, n, bf, handle_edge_low, @@ -632,7 +635,7 @@ siz_t bli_thread_get_range_weighted_sub return area; } -siz_t bli_thread_get_range_mdim +siz_t bli_thread_range_mdim ( dir_t direct, thrinfo_t* thr, @@ -678,20 +681,20 @@ siz_t bli_thread_get_range_mdim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + return bli_thread_range_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + return bli_thread_range_b2t( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_ndim +siz_t bli_thread_range_ndim ( dir_t direct, thrinfo_t* thr, @@ -737,20 +740,20 @@ siz_t bli_thread_get_range_ndim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + return bli_thread_range_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + return bli_thread_range_r2l( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_weighted_l2r +siz_t bli_thread_range_weighted_l2r ( thrinfo_t* thr, obj_t* a, @@ -782,7 +785,7 @@ siz_t bli_thread_get_range_weighted_l2r } area = - bli_thread_get_range_weighted_sub + 
bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -790,7 +793,7 @@ siz_t bli_thread_get_range_weighted_l2r } else // if dense or zeros { - area = bli_thread_get_range_l2r + area = bli_thread_range_l2r ( thr, a, bmult, start, end @@ -800,7 +803,7 @@ siz_t bli_thread_get_range_weighted_l2r return area; } -siz_t bli_thread_get_range_weighted_r2l +siz_t bli_thread_range_weighted_r2l ( thrinfo_t* thr, obj_t* a, @@ -834,7 +837,7 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -842,7 +845,7 @@ siz_t bli_thread_get_range_weighted_r2l } else // if dense or zeros { - area = bli_thread_get_range_r2l + area = bli_thread_range_r2l ( thr, a, bmult, start, end @@ -852,7 +855,7 @@ siz_t bli_thread_get_range_weighted_r2l return area; } -siz_t bli_thread_get_range_weighted_t2b +siz_t bli_thread_range_weighted_t2b ( thrinfo_t* thr, obj_t* a, @@ -886,7 +889,7 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -894,7 +897,7 @@ siz_t bli_thread_get_range_weighted_t2b } else // if dense or zeros { - area = bli_thread_get_range_t2b + area = bli_thread_range_t2b ( thr, a, bmult, start, end @@ -904,7 +907,7 @@ siz_t bli_thread_get_range_weighted_t2b return area; } -siz_t bli_thread_get_range_weighted_b2t +siz_t bli_thread_range_weighted_b2t ( thrinfo_t* thr, obj_t* a, @@ -939,7 +942,7 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -947,7 +950,7 @@ siz_t bli_thread_get_range_weighted_b2t } else // if dense or zeros { - area = 
bli_thread_get_range_b2t + area = bli_thread_range_b2t ( thr, a, bmult, start, end diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 20c70a8f5..8dff32141 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,7 +57,8 @@ void bli_thread_finalize( void ); #endif // Thread range-related prototypes. -void bli_thread_get_range_sub + +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -82,8 +84,8 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_mdim ) -GENPROT( thread_get_range_ndim ) +GENPROT( thread_range_mdim ) +GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ @@ -97,18 +99,18 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_l2r ) -GENPROT( thread_get_range_r2l ) -GENPROT( thread_get_range_t2b ) -GENPROT( thread_get_range_b2t ) +GENPROT( thread_range_l2r ) +GENPROT( thread_range_r2l ) +GENPROT( thread_range_t2b ) +GENPROT( thread_range_b2t ) -GENPROT( thread_get_range_weighted_l2r ) -GENPROT( thread_get_range_weighted_r2l ) -GENPROT( thread_get_range_weighted_t2b ) -GENPROT( thread_get_range_weighted_b2t ) +GENPROT( thread_range_weighted_l2r ) +GENPROT( thread_range_weighted_r2l ) +GENPROT( thread_range_weighted_t2b ) +GENPROT( thread_range_weighted_b2t ) -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -126,17 +128,17 @@ siz_t bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* 
j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ); @@ -211,5 +213,98 @@ void bli_thread_init_rntm( rntm_t* rntm ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); +// ----------------------------------------------------------------------------- + +static void bli_thread_range_jrir_rr + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; +} + +static void bli_thread_range_jrir_sl + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use contiguous slab partitioning of jr/ir loops. + bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +} + +static void bli_thread_range_jrir + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Define a general-purpose version of bli_thread_range_jrir() whose + // definition depends on whether slab or round-robin partitioning was + // requested at configure-time. +#ifdef BLIS_ENABLE_JRIR_SLAB + bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); +#else + bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); +#endif +} + +#if 0 +static void bli_thread_range_weighted_jrir + ( + thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + + // Use contiguous slab partitioning for jr/ir loops. 
+ bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, + handle_edge_low, start, end ); + + *start = *start / bf; *inc = 1; + + if ( *end % bf ) *end = *end / bf + 1; + else *end = *end / bf; + +#else + + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; + +#endif +} +#endif + #endif diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c index 3e8e930de..cd2d4bce8 100644 --- a/kernels/bgq/1/bli_dotv_bgq_int.c +++ b/kernels/bgq/1/bli_dotv_bgq_int.c @@ -34,8 +34,8 @@ #include "blis.h" -void bli_ddotv_bgq_int - ( +void bli_ddotv_bgq_int + ( conj_t conjx, conj_t conjy, dim_t n, @@ -44,14 +44,14 @@ void bli_ddotv_bgq_int double* restrict rho, cntx_t* restrict cntx ) -{ +{ bool_t use_ref = FALSE; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { - PASTEMAC(d,set0s)( rho ); - return; - } + PASTEMAC(d,set0s)( *rho ); + return; + } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )x, 32 ) || bli_is_unaligned_to( ( siz_t )y, 32 ) ) @@ -64,7 +64,7 @@ void bli_ddotv_bgq_int dim_t n_run = n / 4; dim_t n_left = n % 4; - + double rhos = 0.0; #pragma omp parallel reduction(+:rhos) { diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c index 95b5841e9..3b5cecd39 100644 --- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c +++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c @@ -39,15 +39,15 @@ /* - * Here is dgemm kernel for QPX. + * Here is dgemm kernel for QPX. 
* Instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread: * "Achievable peak can either be: * 1) 12.8 GF 8 FMAs cycle * 1.6 GHz * 2) 8.53 GF Takes intoo account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread * 3) 7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread" * - * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. - * I know the kernel John uses is 8x8, so 16 flops per loop iteration. + * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. + * I know the kernel John uses is 8x8, so 16 flops per loop iteration. * Thus there must be 24 total instructions per iteration because 16/24 = 5.33. * * Here, we have 6 loads per iteration. These are executed on a different pipeline from FMAs so @@ -56,23 +56,16 @@ void bli_dgemm_bgq_int_8x8 ( - dim_t k0, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, + double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - //Registers for storing C. 
//4 4x4 subblocks of C, c00, c01, c10, c11 //4 registers per subblock: a, b, c, d @@ -110,7 +103,7 @@ void bli_dgemm_bgq_int_8x8 a0 = vec_lda ( 0 * sizeof(double), &a[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a[8*i] ); - + c00a = vec_xmadd ( b0a, a0, c00a ); c00b = vec_xxmadd( a0, b0a, c00b ); c00c = vec_xmadd ( b0b, a0, c00c ); @@ -131,7 +124,7 @@ void bli_dgemm_bgq_int_8x8 c11c = vec_xmadd ( b1b, a1, c11c ); c11d = vec_xxmadd( a1, b1b, c11d ); } - + // Create patterns for permuting Cb and Cd vector4double pattern = vec_gpci( 01032 ); @@ -140,7 +133,7 @@ void bli_dgemm_bgq_int_8x8 vector4double betav = vec_lds( 0, ( double* )beta ); vector4double alphav = vec_lds( 0, ( double* )alpha ); double ct; - + //Macro to update 4 elements of C in a column. //REG is the register holding those 4 elements //ADDR is the address to write them to @@ -167,7 +160,7 @@ void bli_dgemm_bgq_int_8x8 *(ADDR + (OFFSET + 2) * rs_c) = ct; \ ct = vec_extract( AB, 3 ); \ *(ADDR + (OFFSET + 3) * rs_c) = ct; \ -} +} //Update c00 and c10 sub-blocks UPDATE( c00a, c, 0 ); UPDATE( c10a, c, 4 ); @@ -263,7 +256,7 @@ void bli_zgemm_bgq_int_4x4 for( dim_t i = 0; i < k; i++ ) { - + b0 = vec_ld2a( 0 * sizeof(double), &b_d[8*i] ); b1 = vec_ld2a( 2 * sizeof(double), &b_d[8*i] ); b2 = vec_ld2a( 4 * sizeof(double), &b_d[8*i] ); @@ -271,7 +264,7 @@ void bli_zgemm_bgq_int_4x4 a0 = vec_lda ( 0 * sizeof(double), &a_d[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a_d[8*i] ); - + c00a = vec_xmadd ( b0, a0, c00a ); c00b = vec_xxcpnmadd( a0, b0, c00b ); c01a = vec_xmadd ( b1, a0, c01a ); @@ -308,7 +301,7 @@ void bli_zgemm_bgq_int_4x4 double alphai = bli_zimag( *alpha ); double betar = bli_zreal( *beta ); double betai = bli_zimag( *beta ); - vector4double alphav = vec_splats( 0.0 ); + vector4double alphav = vec_splats( 0.0 ); vector4double betav = vec_splats( 0.0 ); alphav = vec_insert( alphar, alphav, 0); alphav = vec_insert( alphai, alphav, 1); @@ -319,7 +312,7 @@ void bli_zgemm_bgq_int_4x4 betav = vec_insert( 
betar, betav, 2); betav = vec_insert( betai, betav, 3); double ct; - + //Macro to update 2 elements of C in a column. //REG1 is the register holding the first partial sum of those 2 elements diff --git a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemm_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 463155581..60073c501 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -77,7 +77,7 @@ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemm_zen_asm_6x16 +void bli_sgemm_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -923,7 +923,7 @@ void bli_sgemm_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemm_zen_asm_6x8 +void bli_dgemm_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, @@ -1669,7 +1669,7 @@ void bli_dgemm_zen_asm_6x8 #define CGEMM_OUTPUT_RS \ vmovups(ymm0, mem(rcx)) \ -void bli_cgemm_zen_asm_3x8 +void bli_cgemm_haswell_asm_3x8 ( dim_t k0, scomplex* restrict alpha, @@ -2197,7 +2197,7 @@ void bli_cgemm_zen_asm_3x8 #define ZGEMM_OUTPUT_RS \ vmovupd(ymm0, mem(rcx)) \ -void bli_zgemm_zen_asm_3x4 +void bli_zgemm_haswell_asm_3x4 ( dim_t k0, dcomplex* restrict alpha, diff --git a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c similarity index 99% rename from kernels/zen/3/bli_gemm_zen_asm_d8x6.c rename to kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index 830cbec59..098d79d75 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -76,7 +76,7 @@ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemm_zen_asm_16x6 +void bli_sgemm_haswell_asm_16x6 ( dim_t k0, float* restrict alpha, @@ -662,7 +662,7 @@ void bli_sgemm_zen_asm_16x6 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void 
bli_dgemm_zen_asm_8x6 +void bli_dgemm_haswell_asm_8x6 ( dim_t k0, double* restrict alpha, @@ -1257,7 +1257,7 @@ void bli_dgemm_zen_asm_8x6 #define CGEMM_OUTPUT_CS \ vmovups(ymm0, mem(rcx)) \ -void bli_cgemm_zen_asm_8x3 +void bli_cgemm_haswell_asm_8x3 ( dim_t k0, scomplex* restrict alpha, @@ -1785,7 +1785,7 @@ void bli_cgemm_zen_asm_8x3 #define ZGEMM_OUTPUT_CS \ vmovupd(ymm0, mem(rcx)) \ -void bli_zgemm_zen_asm_4x3 +void bli_zgemm_haswell_asm_4x3 ( dim_t k0, dcomplex* restrict alpha, diff --git a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index 288dd2fed..1a2e4a012 100644 --- a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -56,7 +56,7 @@ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemmtrsm_l_zen_asm_6x16 +void bli_sgemmtrsm_l_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -810,7 +810,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemmtrsm_l_zen_asm_6x8 +void bli_dgemmtrsm_l_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, diff --git a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 748769cb3..2ac286e8d 100644 --- a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -56,7 +56,7 @@ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemmtrsm_u_zen_asm_6x16 +void bli_sgemmtrsm_u_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -814,7 +814,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemmtrsm_u_zen_asm_6x8 +void 
bli_dgemmtrsm_u_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h index c8f0f1653..85670345d 100644 --- a/kernels/haswell/bli_kernels_haswell.h +++ b/kernels/haswell/bli_kernels_haswell.h @@ -32,23 +32,32 @@ */ -// d12x4 -GEMM_UKR_PROT( float, s, gemm_haswell_asm_24x4 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_12x4 ) +// -- level-3 -- -// d4x12 -GEMM_UKR_PROT( float, s, gemm_haswell_asm_4x24 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_4x12 ) - -// d6x8 +// gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) -// d8x6 +// gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) +// gemmtrsm_l (asm d6x8) +GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) + +// gemmtrsm_u (asm d6x8) +GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) + + +// gemm (asm d8x6) +//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) +//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) +//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) +//GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 119771436..842989a5a 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -76,32 +76,3 @@ AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) -// -- level-3 -- - -// gemm (asm d6x8) -GEMM_UKR_PROT( float, s, gemm_zen_asm_6x16 ) -GEMM_UKR_PROT( double, d, gemm_zen_asm_6x8 
) -GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_3x8 ) -GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_3x4 ) - -// gemm (asm d8x6) -GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 ) -GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 ) -GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 ) -GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 ) - -// gemmtrsm_l (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_zen_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_6x8 ) - -// gemmtrsm_u (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_zen_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_6x8 ) - - -// gemm (asm d8x6) -//GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 ) -//GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 ) -//GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 ) -//GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 ) - diff --git a/sandbox/ref99/blx_gemm_int.c b/sandbox/ref99/blx_gemm_int.c index 4937095a9..febb8040a 100644 --- a/sandbox/ref99/blx_gemm_int.c +++ b/sandbox/ref99/blx_gemm_int.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -46,10 +47,10 @@ void blx_gemm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_voft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; + gemm_var_oft f; // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); diff --git a/sandbox/ref99/cntl/blx_gemm_cntl.c b/sandbox/ref99/cntl/blx_gemm_cntl.c index ebcf6da30..d182296fa 100644 --- a/sandbox/ref99/cntl/blx_gemm_cntl.c +++ b/sandbox/ref99/cntl/blx_gemm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -54,7 +55,14 @@ cntl_t* blx_gemmbp_cntl_create pack_t schema_b ) { - void* macro_kernel_p = blx_gemm_ker_var2; + void* macro_kernel_fp; + void* packa_fp; + void* packb_fp; + + macro_kernel_fp = blx_gemm_ker_var2; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node @@ -69,7 +77,7 @@ cntl_t* blx_gemmbp_cntl_create ( family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, + macro_kernel_fp, gemm_cntl_bu_ke ); @@ -77,7 +85,7 @@ cntl_t* blx_gemmbp_cntl_create cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node ( blx_gemm_packa, // pack the left-hand operand - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -101,7 +109,7 @@ cntl_t* blx_gemmbp_cntl_create cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node ( blx_gemm_packb, // pack the right-hand operand - bli_packm_blk_var1, + packb_fp, BLIS_KR, BLIS_NR, FALSE, // do NOT invert diagonal diff --git a/sandbox/ref99/vars/blx_gemm_blk_var1.c b/sandbox/ref99/vars/blx_gemm_blk_var1.c index 43eb40bef..70482ede1 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var1.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var1 dim_t my_start, my_end; // Determine the current thread's subpartition range. 
- bli_thread_get_range_mdim + bli_thread_range_mdim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_blk_var2.c b/sandbox/ref99/vars/blx_gemm_blk_var2.c index debcb2dfc..00a19ceef 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var2.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var2 dim_t my_start, my_end; // Determine the current thread's subpartition range. - bli_thread_get_range_ndim + bli_thread_range_ndim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index c780489e9..21282a3f5 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -256,16 +257,31 @@ void PASTECH2(blx_,ch,varname) \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ /* Save the desired output datatype (indicating no typecasting). */ \ - bli_auxinfo_set_dt_on_output( dt, &aux ); \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. 
Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -280,7 +296,7 @@ void PASTECH2(blx_,ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -290,12 +306,12 @@ void PASTECH2(blx_,ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/sandbox/ref99/vars/blx_gemm_var.h b/sandbox/ref99/vars/blx_gemm_var.h index 22911eda2..b434ea60a 100644 --- a/sandbox/ref99/vars/blx_gemm_var.h +++ b/sandbox/ref99/vars/blx_gemm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c b/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c new file mode 100644 index 000000000..eff1ecc85 --- /dev/null +++ b/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c @@ -0,0 +1,373 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "blix.h" + +// Function pointer type for datatype-specific functions. +typedef void (*gemm_fp) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Function pointer array for datatype-specific functions. 
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] = +{ + PASTECH2(blx_,s,gemm_ker_var2rr), + PASTECH2(blx_,c,gemm_ker_var2rr), + PASTECH2(blx_,d,gemm_ker_var2rr), + PASTECH2(blx_,z,gemm_ker_var2rr) +}; + + +void blx_gemm_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + gemm_fp f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(blx_,ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +#if 0 +GENTFUNC( float, s, gemm_ker_var2rr ) +GENTFUNC( double, d, gemm_ker_var2rr ) +GENTFUNC( scomplex, c, gemm_ker_var2rr ) +GENTFUNC( dcomplex, z, gemm_ker_var2rr ) +#else +INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr ) +#endif + diff --git a/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c b/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c new file mode 100644 index 000000000..31f51df92 --- /dev/null +++ b/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c @@ -0,0 +1,373 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "blix.h" + +// Function pointer type for datatype-specific functions. +typedef void (*gemm_fp) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Function pointer array for datatype-specific functions. 
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] = +{ + PASTECH2(blx_,s,gemm_ker_var2sl), + PASTECH2(blx_,c,gemm_ker_var2sl), + PASTECH2(blx_,d,gemm_ker_var2sl), + PASTECH2(blx_,z,gemm_ker_var2sl) +}; + + +void blx_gemm_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + gemm_fp f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(blx_,ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +#if 0 +GENTFUNC( float, s, gemm_ker_var2sl ) +GENTFUNC( double, d, gemm_ker_var2sl ) +GENTFUNC( scomplex, c, gemm_ker_var2sl ) +GENTFUNC( dcomplex, z, gemm_ker_var2sl ) +#else +INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl ) +#endif + diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 3dcd6d435..e91b100b2 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=100 \ +PDEF_ST := -DP_BEGIN=96 \ -DP_END=2000 \ - -DP_INC=100 + -DP_INC=96 -PDEF_MT := -DP_BEGIN=200 \ - -DP_END=10000 \ - -DP_INC=200 +PDEF_MT := -DP_BEGIN=192 \ + -DP_END=3000 \ + -DP_INC=192 @@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt blis-st: blis-gemm-st blis-mt: blis-gemm-mt -blis-nat-st: blis-gemm-nat-st -blis-nat-mt: blis-gemm-nat-mt - openblas-st: openblas-gemm-st openblas-mt: openblas-gemm-mt @@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \ blis-gemm-mt: blis-gemm-nat-mt \ blis-gemm-ind-mt +blis-nat-st: \ + test_sgemm_asm_blis_st.x \ + test_dgemm_asm_blis_st.x \ + test_cgemm_asm_blis_st.x \ + test_zgemm_asm_blis_st.x \ + test_sherk_asm_blis_st.x \ + test_dherk_asm_blis_st.x \ + test_cherk_asm_blis_st.x \ + test_zherk_asm_blis_st.x \ + test_strmm_asm_blis_st.x \ + test_dtrmm_asm_blis_st.x \ + test_ctrmm_asm_blis_st.x \ + test_ztrmm_asm_blis_st.x \ + test_strsm_asm_blis_st.x \ + test_dtrsm_asm_blis_st.x \ + test_ctrsm_asm_blis_st.x \ + test_ztrsm_asm_blis_st.x + +blis-nat-mt: \ + test_sgemm_asm_blis_mt.x \ + test_dgemm_asm_blis_mt.x \ + test_cgemm_asm_blis_mt.x \ + test_zgemm_asm_blis_mt.x \ + test_sherk_asm_blis_mt.x \ + test_dherk_asm_blis_mt.x \ + test_cherk_asm_blis_mt.x \ + test_zherk_asm_blis_mt.x \ + test_strmm_asm_blis_mt.x \ + test_dtrmm_asm_blis_mt.x \ + test_ctrmm_asm_blis_mt.x \ + test_ztrmm_asm_blis_mt.x \ + test_strsm_asm_blis_mt.x \ + test_dtrsm_asm_blis_mt.x \ + test_ctrsm_asm_blis_mt.x \ + test_ztrsm_asm_blis_mt.x + blis-gemm-nat-st: \ test_sgemm_asm_blis_st.x \ test_dgemm_asm_blis_st.x \ @@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ # blis asm -test_d%_asm_blis_st.o: 
test_%.c +test_d%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_s%_asm_blis_st.o: test_%.c +test_s%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_z%_asm_blis_st.o: test_%.c +test_z%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_c%_asm_blis_st.o: test_%.c +test_c%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_d%_asm_blis_mt.o: test_%.c +test_d%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_s%_asm_blis_mt.o: test_%.c +test_s%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_z%_asm_blis_mt.o: test_%.c +test_z%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_c%_asm_blis_mt.o: test_%.c +test_c%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ # openblas diff --git a/test/3m4m/test_herk.c b/test/3m4m/test_herk.c new file mode 100644 index 000000000..66a057a59 --- /dev/null +++ b/test/3m4m/test_herk.c @@ -0,0 +1,314 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, k; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, k_input; + ind_t ind; + num_t dt, dt_real; + char dt_ch; + int r, n_repeats; + uplo_t uploc; + trans_t transa; + f77_char f77_uploc; + f77_char f77_transa; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + dt_real = bli_dt_proj_to_real( DT ); + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + k_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. 
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + uploc = BLIS_LOWER; + transa = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt_real, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + if ( bli_does_trans( transa ) ) + bli_obj_create( dt, k, m, 0, 0, &a ); + else + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, m, m, 0, 0, &c ); + //bli_obj_create( dt, m, k, 2, 2*m, &a ); + //bli_obj_create( dt, k, n, 2, 2*k, &b ); + //bli_obj_create( dt, m, n, 2, 2*m, &c ); + bli_obj_create( dt, m, m, 0, 0, &c_save ); + + bli_randm( &a ); + 
bli_randm( &c ); + + bli_obj_set_struc( BLIS_HERMITIAN, &c ); + bli_obj_set_uplo( uploc, &c ); + + bli_obj_set_conjtrans( transa, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_herk( &alpha, + &a, + &beta, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + + ssyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + + dsyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + 
alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + zherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trmm.c b/test/3m4m/test_trmm.c new file mode 100644 index 000000000..06ed38539 --- /dev/null +++ b/test/3m4m/test_trmm.c @@ -0,0 +1,328 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. 
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trmm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* cp = bli_obj_buffer( &c ); + + strmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( 
&c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + 
} + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trsm.c b/test/3m4m/test_trsm.c new file mode 100644 index 000000000..f417a5361 --- /dev/null +++ b/test/3m4m/test_trsm.c @@ -0,0 +1,338 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c, d; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + //bli_obj_create( dt, m, n, n, 1, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &d ); + else + bli_obj_create( dt, n, n, 0, 0, &d ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setd( &BLIS_TWO, &d ); + bli_addd( &d, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trsm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* 
cp = bli_obj_buffer( &c ); + + strsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu 
%7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + bli_obj_free( &d ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/thread_ranges/test_ranges.c b/test/thread_ranges/test_ranges.c index 68ffe7fec..9bf293ca5 100644 --- a/test/thread_ranges/test_ranges.c +++ b/test/thread_ranges/test_ranges.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -290,13 +291,13 @@ int main( int argc, char** argv ) thrinfo.work_id = t; if ( part_n_dim && go_fwd ) - area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); else if ( part_n_dim && go_bwd ) - area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); else if ( part_m_dim && go_fwd ) - area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); else // ( part_m_dim && go_bwd ) - area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); width = end - start; diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 230b65820..59911d4ed 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -752,19 +752,73 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. 
We query the result of // that automatic choice via sizeof(gint_t). -/* - if ( bli_info_get_int_type_size() == 32 || - bli_info_get_int_type_size() == 64 ) - sprintf( int_type_size_str, "%d", ( int )bli_info_get_int_type_size() ); - else - sprintf( int_type_size_str, "%d", ( int )sizeof(gint_t) * 8 ); -*/ if ( bli_info_get_int_type_size() == 32 || bli_info_get_int_type_size() == 64 ) int_type_size = bli_info_get_int_type_size(); else int_type_size = sizeof(gint_t) * 8; + char impl_str[16]; + char jrir_str[16]; + + // Describe the threading implementation. + if ( bli_info_get_enable_openmp() ) sprintf( impl_str, "openmp" ); + else if ( bli_info_get_enable_pthreads() ) sprintf( impl_str, "pthreads" ); + else /* threading disabled */ sprintf( impl_str, "disabled" ); + + // Describe the status of jrir thread partitioning. + if ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" ); + else /*bli_info_get_thread_part_jrir_rr()*/ sprintf( jrir_str, "round-robin" ); + + char nt_str[16]; + char jc_nt_str[16]; + char pc_nt_str[16]; + char ic_nt_str[16]; + char jr_nt_str[16]; + char ir_nt_str[16]; + + // Query the number of ways of parallelism per loop (and overall) and + // convert these values into strings, with "unset" being used if the + // value returned was -1 (indicating the environment variable was unset). 
+ dim_t nt = bli_thread_get_num_threads(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); + + if ( nt == -1 ) sprintf( nt_str, "unset" ); + else sprintf( nt_str, "%d", ( int ) nt ); + if ( jc_nt == -1 ) sprintf( jc_nt_str, "unset" ); + else sprintf( jc_nt_str, "%d", ( int )jc_nt ); + if ( pc_nt == -1 ) sprintf( pc_nt_str, "unset" ); + else sprintf( pc_nt_str, "%d", ( int )pc_nt ); + if ( ic_nt == -1 ) sprintf( ic_nt_str, "unset" ); + else sprintf( ic_nt_str, "%d", ( int )ic_nt ); + if ( jr_nt == -1 ) sprintf( jr_nt_str, "unset" ); + else sprintf( jr_nt_str, "%d", ( int )jr_nt ); + if ( ir_nt == -1 ) sprintf( ir_nt_str, "unset" ); + else sprintf( ir_nt_str, "%d", ( int )ir_nt ); + + // Set up rntm_t objects for each of the four families: + // gemm, herk, trmm, trsm. + rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r; + dim_t m = 1000, n = 1000, k = 1000; + + bli_thread_init_rntm( &gemm ); + bli_thread_init_rntm( &herk ); + bli_thread_init_rntm( &trmm_l ); + bli_thread_init_rntm( &trmm_r ); + bli_thread_init_rntm( &trsm_l ); + bli_thread_init_rntm( &trsm_r ); + + bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm ); + bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk ); + bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_LEFT, m, n, k, &trmm_l ); + bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_RIGHT, m, n, k, &trmm_r ); + bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_LEFT, m, n, k, &trsm_l ); + bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_RIGHT, m, n, k, &trsm_r ); + // Output some system parameters. 
libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS library info -------------------------------------\n" ); @@ -799,12 +853,62 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "CBLAS compatibility layer \n" ); libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_cblas() ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "libmemkind \n" ); + libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_memkind() ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "gemm sandbox \n" ); + libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_sandbox() ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), sizeof(double), sizeof(scomplex), sizeof(dcomplex) ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "--- BLIS parallelization info ---\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "multithreading %s\n", impl_str ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "thread auto-factorization \n" ); + libblis_test_fprintf_c( os, " m dim thread ratio %d\n", ( int )BLIS_THREAD_RATIO_M ); + libblis_test_fprintf_c( os, " n dim thread ratio %d\n", ( int )BLIS_THREAD_RATIO_N ); + libblis_test_fprintf_c( os, " jr max threads %d\n", ( int )BLIS_THREAD_MAX_JR ); + libblis_test_fprintf_c( os, " ir max threads %d\n", ( int )BLIS_THREAD_MAX_IR ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "ways of parallelism nt jc pc ic jr ir\n" ); + libblis_test_fprintf_c( os, " environment %5s %5s %5s %5s %5s %5s\n", + nt_str, jc_nt_str, pc_nt_str, + ic_nt_str, jr_nt_str, ir_nt_str ); + libblis_test_fprintf_c( os, " gemm (m,n,k=1000) %5d %5d %5d %5d %5d\n", + ( int 
)bli_rntm_jc_ways( &gemm ), ( int )bli_rntm_pc_ways( &gemm ), + ( int )bli_rntm_ic_ways( &gemm ), + ( int )bli_rntm_jr_ways( &gemm ), ( int )bli_rntm_ir_ways( &gemm ) ); + libblis_test_fprintf_c( os, " herk (m,k=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &herk ), ( int )bli_rntm_pc_ways( &herk ), + ( int )bli_rntm_ic_ways( &herk ), + ( int )bli_rntm_jr_ways( &herk ), ( int )bli_rntm_ir_ways( &herk ) ); + libblis_test_fprintf_c( os, " trmm_l (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trmm_l ), ( int )bli_rntm_pc_ways( &trmm_l ), + ( int )bli_rntm_ic_ways( &trmm_l ), + ( int )bli_rntm_jr_ways( &trmm_l ), ( int )bli_rntm_ir_ways( &trmm_l ) ); + libblis_test_fprintf_c( os, " trmm_r (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trmm_r ), ( int )bli_rntm_pc_ways( &trmm_r ), + ( int )bli_rntm_ic_ways( &trmm_r ), + ( int )bli_rntm_jr_ways( &trmm_r ), ( int )bli_rntm_ir_ways( &trmm_r ) ); + libblis_test_fprintf_c( os, " trsm_l (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trsm_l ), ( int )bli_rntm_pc_ways( &trsm_l ), + ( int )bli_rntm_ic_ways( &trsm_l ), + ( int )bli_rntm_jr_ways( &trsm_l ), ( int )bli_rntm_ir_ways( &trsm_l ) ); + libblis_test_fprintf_c( os, " trsm_r (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trsm_r ), ( int )bli_rntm_pc_ways( &trsm_r ), + ( int )bli_rntm_ic_ways( &trsm_r ), + ( int )bli_rntm_jr_ways( &trsm_r ), ( int )bli_rntm_ir_ways( &trsm_r ) ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "thread partitioning \n" ); + //libblis_test_fprintf_c( os, " jc/ic loops %s\n", "slab" ); + libblis_test_fprintf_c( os, " jr/ir loops %s\n", jrir_str ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS default implementations ---\n" ); diff --git a/windows/build/libblis-symbols.def b/windows/build/libblis-symbols.def index 983292b05..13ae1c60c 100644 --- a/windows/build/libblis-symbols.def +++ 
b/windows/build/libblis-symbols.def @@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_get_range_b2t -bli_thread_get_range_l2r -bli_thread_get_range_mdim -bli_thread_get_range_ndim -bli_thread_get_range_r2l -bli_thread_get_range_sub -bli_thread_get_range_t2b -bli_thread_get_range_weighted_b2t -bli_thread_get_range_weighted_l2r -bli_thread_get_range_weighted_r2l -bli_thread_get_range_weighted_sub -bli_thread_get_range_weighted_t2b -bli_thread_get_range_width_l +bli_thread_range_b2t +bli_thread_range_l2r +bli_thread_range_mdim +bli_thread_range_ndim +bli_thread_range_r2l +bli_thread_range_sub +bli_thread_range_t2b +bli_thread_range_weighted_b2t +bli_thread_range_weighted_l2r +bli_thread_range_weighted_r2l +bli_thread_range_weighted_sub +bli_thread_range_weighted_t2b +bli_thread_range_width_l bli_thread_init bli_thread_init_rntm bli_thread_init_rntm_from_env