Added 1m-specific APIs for bp, pb gemm algorithms.

Details:
- Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the
  body of bli_gemm_cntl_create() replaced with a call to the former.
- Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now,
  bli_cntl_free() can check if the thread parameter is NULL, and if so,
  call the latter, and otherwise call the former.
- Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in
  terms of bli_gemm1mxx_cntx_init(), which behaves the same as
  bli_gemm1m_cntx_init() did before, except that an extra bool parameter
  (is_pb) is used to support both bp and pb algorithms (including to
  support the anti-preference field described below).
- Added support for "anti-preference" in context. The anti_pref field,
  when true, will toggle the boolean return value of routines such as
  bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of
  causing BLIS to transpose the operation to achieve disagreement (rather
  than agreement) between the storage of C and the micro-kernel output
  preference. This disagreement is needed for panel-block implementations,
  since they induce a transposition of the suboperation immediately before
  the macro-kernel is called, which changes the apparent storage of C. For
  now, anti-preference is used only with the pb algorithm for 1m (and not
  with any other non-1m implementation).
- Defined new functions,
    bli_cntx_l3_ukr_eff_prefers_storage_of()
    bli_cntx_l3_ukr_eff_dislikes_storage_of()
    bli_cntx_l3_nat_ukr_eff_prefers_storage_of()
    bli_cntx_l3_nat_ukr_eff_dislikes_storage_of()
  which are identical to their non-"eff" (effectively) counterparts except
  that they take the anti-preference field of the context into account.
- Explicitly initialize the anti-pref field to FALSE in
  bli_gks_cntx_set_l3_nat_ukr_prefs().
- Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel
  in terms of the existing block-panel macro-kernel _ker_var2(). This
  technique requires inducing transposes on all operands and swapping
  A and B.
- Changed bli_obj_induce_trans() macro so that pack-related fields are
  also changed to reflect the induced transposition.
- Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily
  specify the 1m algorithm (block-panel or panel-block).
- Renamed the following cntx_t-related macros:
    bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block()
    bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel()
    bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel()
  and updated all instantiations. Also updated the field names in the
  cntx_t struct.
- Comment updates.
This commit is contained in:
Field G. Van Zee
2017-01-25 16:25:46 -06:00
committed by prangana
parent 1d728ccb23
commit 4f61528d56
25 changed files with 892 additions and 187 deletions

View File

@@ -121,11 +121,11 @@ siz_t bli_packm_init
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
{
schema = bli_cntx_get_pack_schema_a( cntx );
schema = bli_cntx_get_pack_schema_a_block( cntx );
}
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
{
schema = bli_cntx_get_pack_schema_b( cntx );
schema = bli_cntx_get_pack_schema_b_panel( cntx );
}
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{

View File

@@ -70,8 +70,8 @@ void bli_l3_cntl_create_if
else
{
// If the user provided a control tree, create a copy and use it
// instead (so that it can be used to cache things like pack mem_t
// entries).
// instead (so that threads can use its local tree as a place to
// cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
}
}

View File

@@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_gemm_cntx_finalize( cntx_t* cntx )
@@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_trsm_cntx_finalize( cntx_t* cntx )

View File

@@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
return bli_gemmbp_cntl_create( family );
}
// -----------------------------------------------------------------------------
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
// Change the macro-kernel if the operation family is herk or trmm.
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
@@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa,
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_MR,
BLIS_KR,
@@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb,
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_NR,
@@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
// Create and return a control tree for the panel-block (pb) gemm
// algorithm. The tree mirrors the one built by bli_gemmbp_cntl_create(),
// except that the roles of A and B are swapped: B is packed as the
// "block" operand (into the A-block buffer) and A as the "panel" operand
// (into the B-panel buffer). The macro-kernel, bli_gemm_ker_var1(),
// induces the transposition needed to reuse the block-panel macro-kernel
// internally.
cntl_t* bli_gemmpb_cntl_create
(
opid_t family // operation family id (currently only gemm takes this path)
)
{
void* macro_kernel_p = bli_gemm_ker_var1;
// Change the macro-kernel if the operation family is herk or trmm.
// NOTE: these substitutions are disabled for now; the pb algorithm does
// not yet support herk or trmm macro-kernels.
//if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
(
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_ub_ke
);
// Create a node for packing matrix A (which is really the right-hand
// operand "B"). Note the swapped register blocksizes (KR x MR) and that
// the packed panels land in the buffer normally reserved for the block
// of A.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_MR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
gemm_cntl_pb_ub
);
// Create a node for partitioning the n dimension by MC. (MC is used
// here, rather than NC, because the m and n dimensions exchange roles
// once the pb macro-kernel induces its transposition.)
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
(
BLIS_MC,
bli_gemm_blk_var2,
gemm_cntl_packb
);
// Create a node for packing matrix B (which is really the left-hand
// operand "A"). Again note the swapped register blocksizes (NR x KR)
// and the use of the B-panel buffer for the left-hand operand.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_NR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
gemm_cntl_op_pb
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
(
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packa
);
// Create a node for partitioning the m dimension by NC. (See the note
// above regarding the exchanged roles of the m and n dimensions.)
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
(
BLIS_NC,
bli_gemm_blk_var1,
gemm_cntl_mm_op
);
// Return the root of the newly constructed control tree.
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create
opid_t family
);
// -----------------------------------------------------------------------------
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
);
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
);
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -46,11 +46,10 @@ void bli_gemm_front
cntl_t* cntl
)
{
#ifdef BLIS_SMALL_MATRIX_ENABLE
#ifndef BLIS_ENABLE_MULTITHREADING
gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
if(BLIS_SUCCESS != status)
#endif
#endif
{
obj_t a_local;
@@ -90,9 +89,6 @@ void bli_gemm_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -103,6 +99,7 @@ void bli_gemm_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Panel-block gemm macro-kernel. Rather than providing a standalone
// implementation, we express _ker_var1() as _ker_var2() applied to the
// transposed suboperation: logically transpose all three operands, then
// exchange the roles of A and B.
void bli_gemm_ker_var1
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Toggle the apparent storage of every operand. (The macro swaps
	// dimensions, strides, and pack-related fields, but deliberately
	// leaves the transposition bits untouched.)
	bli_obj_induce_trans( *c );
	bli_obj_induce_trans( *a );
	bli_obj_induce_trans( *b );

	// With the suboperation transposed, A and B trade places.
	bli_gemm_ker_var2( b, a, c, cntx, cntl, thread );
}

View File

@@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:

View File

@@ -97,6 +97,16 @@ void bli_cntl_free
cntl_t* cntl,
thrinfo_t* thread
)
{
if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread );
else bli_cntl_free_wo_thrinfo( cntl );
}
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
)
{
// Base case: simply return when asked to free NULL nodes.
if ( cntl == NULL ) return;
@@ -112,7 +122,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-node and its
// children.
bli_cntl_free( cntl_sub_node, thread_sub_node );
bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node );
}
// Free the current node's params field, if it is non-NULL.
@@ -122,8 +132,8 @@ void bli_cntl_free
}
// Release the current node's pack mem_t entry back to the memory
// broker from which it originated, but only if the current thread
// is chief for its group, and only if the mem_t is allocated.
// broker from which it originated, but only if the mem_t entry is
// allocated, and only if the current thread is chief for its group.
if ( bli_thread_am_ochief( thread ) )
if ( bli_mem_is_alloc( cntl_pack_mem ) )
{
@@ -134,6 +144,42 @@ void bli_cntl_free
bli_cntl_obj_free( cntl );
}
// Recursively free a control tree when no accompanying thrinfo_t tree
// exists. Without thread info there is no notion of a "chief" thread,
// so pack mem_t entries are released unconditionally (when allocated).
void bli_cntl_free_wo_thrinfo
     (
       cntl_t* cntl
     )
{
	// An empty (sub)tree requires no work.
	if ( cntl == NULL ) return;

	cntl_t* sub_node = bli_cntl_sub_node( cntl );
	void*   params   = bli_cntl_params( cntl );
	mem_t*  pack_mem = bli_cntl_pack_mem( cntl );

	// Free the child subtree (and, transitively, all of its descendants)
	// before releasing anything attached to the current node.
	bli_cntl_free_wo_thrinfo( sub_node );

	// Free the node's params struct, if one was allocated.
	if ( params != NULL )
	{
		bli_free_intl( params );
	}

	// Return the node's pack mem_t entry to the memory broker from which
	// it originated, but only if the mem_t entry is allocated.
	if ( bli_mem_is_alloc( pack_mem ) )
	{
		bli_membrk_release( pack_mem );
	}

	// Finally, free the node itself.
	bli_cntl_obj_free( cntl );
}
// -----------------------------------------------------------------------------
cntl_t* bli_cntl_copy

View File

@@ -75,12 +75,25 @@ void bli_cntl_obj_clear
cntl_t* cntl
);
// -----------------------------------------------------------------------------
void bli_cntl_free
(
cntl_t* cntl,
thrinfo_t* thread
);
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
);
void bli_cntl_free_wo_thrinfo
(
cntl_t* cntl
);
cntl_t* bli_cntl_copy
(
cntl_t* cntl

View File

@@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx )
return bli_cntx_method( cntx );
}
pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx )
{
return bli_cntx_schema_a( cntx );
return bli_cntx_schema_a_block( cntx );
}
pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx )
{
return bli_cntx_schema_b( cntx );
return bli_cntx_schema_b_panel( cntx );
}
pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx )
{
return bli_cntx_schema_c_panel( cntx );
}
bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx )
{
return bli_cntx_anti_pref( cntx );
}
#endif
@@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method,
bli_cntx_set_method( method, cntx );
}
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx )
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
}
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_c( pack_t schema_c,
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx )
{
bli_cntx_set_schema_c_panel( schema_c, cntx );
}
#if 0
void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
cntx_t* cntx )
{
bli_cntx_set_schema_c( schema_c, cntx );
bli_cntx_set_anti_pref( anti_pref, cntx );
}
#endif
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
dim_t m, dim_t n, dim_t k )
@@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
// Return the *effective* storage preference of the native micro-kernel
// for obj: identical to bli_cntx_l3_nat_ukr_prefers_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
                                                   l3ukr_t ukr_id,
                                                   cntx_t* cntx )
{
	// Query the raw (non-effective) preference first.
	bool_t prefers = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !prefers : prefers;
}
// Return the *effective* storage dislike of the native micro-kernel for
// obj: identical to bli_cntx_l3_nat_ukr_dislikes_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
                                                    l3ukr_t ukr_id,
                                                    cntx_t* cntx )
{
	// Query the raw (non-effective) dislike first.
	bool_t dislikes = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes;
}
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
@@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
// Return the *effective* storage preference of the level-3 micro-kernel
// for obj: identical to bli_cntx_l3_ukr_prefers_storage_of(), except
// that an enabled anti-preference field in the context inverts the
// answer.
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
                                               l3ukr_t ukr_id,
                                               cntx_t* cntx )
{
	// Query the raw (non-effective) preference first.
	bool_t prefers = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !prefers : prefers;
}
// Return the *effective* storage dislike of the level-3 micro-kernel for
// obj: identical to bli_cntx_l3_ukr_dislikes_storage_of(), except that
// an enabled anti-preference field in the context inverts the answer.
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
                                                l3ukr_t ukr_id,
                                                cntx_t* cntx )
{
	// Query the raw (non-effective) dislike first.
	bool_t dislikes = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// Negate the result when the context's anti-preference is set.
	return bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes;
}
// -----------------------------------------------------------------------------
void bli_cntx_print( cntx_t* cntx )

View File

@@ -59,6 +59,8 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
bool_t anti_pref;
dim_t* thrloop;
membrk_t* membrk;
@@ -113,26 +115,30 @@ typedef struct cntx_s
\
( (cntx)->method )
#define bli_cntx_schema_a( cntx ) \
#define bli_cntx_schema_a_block( cntx ) \
\
( (cntx)->schema_a )
( (cntx)->schema_a_block )
#define bli_cntx_schema_b( cntx ) \
#define bli_cntx_schema_b_panel( cntx ) \
\
( (cntx)->schema_b )
( (cntx)->schema_b_panel )
#define bli_cntx_schema_c( cntx ) \
#define bli_cntx_schema_c_panel( cntx ) \
\
( (cntx)->schema_c )
( (cntx)->schema_c_panel )
#define bli_cntx_membrk( cntx ) \
#define bli_cntx_anti_pref( cntx ) \
\
( (cntx)->membrk )
( (cntx)->anti_pref )
#define bli_cntx_thrloop( cntx ) \
\
( (cntx)->thrloop )
#define bli_cntx_membrk( cntx ) \
\
( (cntx)->membrk )
#if 1
#define bli_cntx_jc_way( cntx ) \
\
@@ -211,24 +217,24 @@ typedef struct cntx_s
(cntx_p)->method = _method; \
}
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \
{ \
(cntx_p)->schema_a = _schema_a; \
(cntx_p)->schema_a_block = _schema_a_block; \
}
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \
{ \
(cntx_p)->schema_b = _schema_b; \
(cntx_p)->schema_b_panel = _schema_b_panel; \
}
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \
{ \
(cntx_p)->schema_c = _schema_c; \
(cntx_p)->schema_c_panel = _schema_c_panel; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
(cntx_p)->anti_pref = _anti_pref; \
}
#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \
@@ -241,6 +247,11 @@ typedef struct cntx_s
(cntx_p)->thrloop[ BLIS_KR ] = 1; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
}
// cntx_t query (complex)
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
@@ -323,13 +334,17 @@ typedef struct cntx_s
\
bli_cntx_method( cntx )
#define bli_cntx_get_pack_schema_a( cntx ) \
#define bli_cntx_get_pack_schema_a_block( cntx ) \
\
bli_cntx_schema_a( cntx )
bli_cntx_schema_a_block( cntx )
#define bli_cntx_get_pack_schema_b( cntx ) \
#define bli_cntx_get_pack_schema_b_panel( cntx ) \
\
bli_cntx_schema_b( cntx )
bli_cntx_schema_b_panel( cntx )
#define bli_cntx_get_pack_schema_c_panel( cntx ) \
\
bli_cntx_schema_c_panel( cntx )
#define bli_cntx_get_membrk( cntx ) \
\
@@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
// l1vkr_t ker_id,
// cntx_t* cntx );
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx );
//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx );
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
@@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func,
cntx_t* cntx );
void bli_cntx_set_ind_method( ind_t method,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c( pack_t schema_c,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx );
//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
// cntx_t* cntx );
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
side_t side,
cntx_t* cntx,
@@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx );
@@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
// print function

View File

@@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ];
bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref );
// Explicitly set the anti-preference to FALSE.
bli_cntx_set_anti_pref( FALSE, cntx );
}

View File

@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
(obj).n_panel = n0; \
}
#define bli_obj_set_panel_dims( m0, n0, obj ) \
{ \
bli_obj_set_panel_length( m0, obj ); \
bli_obj_set_panel_width( n0, obj ); \
}
#define bli_obj_set_panel_dim( panel_dim, obj ) \
{ \
(obj).pd = panel_dim; \
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
#define bli_obj_induce_trans( obj ) \
{ \
{ \
/* Induce transposition among basic fields. */ \
dim_t m_ = bli_obj_length( obj ); \
dim_t n_ = bli_obj_width( obj ); \
inc_t rs_ = bli_obj_row_stride( obj ); \
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
\
if ( bli_obj_is_upper_or_lower( obj ) ) \
bli_obj_toggle_uplo( obj ); \
\
/* Induce transposition among packed fields. */ \
dim_t m_padded_ = bli_obj_padded_length( obj ); \
dim_t n_padded_ = bli_obj_padded_width( obj ); \
dim_t m_panel_ = bli_obj_panel_length( obj ); \
dim_t n_panel_ = bli_obj_panel_width( obj ); \
\
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
\
/* Note that this macro DOES NOT touch the transposition bit! If
the calling code is using this macro to handle an object whose

View File

@@ -975,9 +975,11 @@ typedef struct cntx_s
opid_t family;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
pack_t schema_a_block;
pack_t schema_b_panel;
pack_t schema_c_panel;
bool_t anti_pref;
dim_t thrloop[ BLIS_NUM_LOOPS ];

View File

@@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
BLIS_PACKED_COL_PANELS_3MS,
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx );
}
void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
}
else // if ( stage == 2 )
{
bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
}
}
@@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
0, // not yet needed; varies with _stage()
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
}
void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 2 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI,
BLIS_PACKED_COL_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx );
}
}
@@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage()
0, // not yet needed; varies with _stage()
cntx );
bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage()
bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage()
}
void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
@@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx )
// Set the pack_t schemas as a function of the stage of execution.
if ( stage == 0 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
else if ( stage == 1 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else if ( stage == 2 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_IO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx );
}
else // if ( stage == 3 )
{
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_RO, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx );
}
}
@@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx )
// -----------------------------------------------------------------------------
// Initialize the context for the 1m induced method. For backward
// compatibility, this simply delegates to the block-panel (bp) variant.
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
{
// Default to context for block-panel algorithm.
bli_gemm1mbp_cntx_init( dt, cntx );
}
// Initialize a context for the 1m block-panel (bp) algorithm. Thin
// wrapper around bli_gemm1mxx_cntx_init() with is_pb = FALSE.
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx )
{
bli_gemm1mxx_cntx_init( dt, FALSE, cntx );
}
// Initialize a context for the 1m panel-block (pb) algorithm. Thin
// wrapper around bli_gemm1mxx_cntx_init() with is_pb = TRUE.
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx )
{
bli_gemm1mxx_cntx_init( dt, TRUE, cntx );
}
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx )
{
const ind_t method = BLIS_1M;
@@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
// Initialize the context with packm-related kernels.
bli_packm_cntx_init( dt, cntx );
// Initialize the blocksizes according to the micro-kernel preference as
// well as the algorithm.
if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
// Set the pack_t schemas for the c_bp or r_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
}
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), and the induced method.
bli_gks_cntx_set_blkszs
@@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
BLIS_KR, BLIS_KR, 1.0, 1.0,
cntx
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
}
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
// Set the pack_t schemas for the r_bp or c_pb algorithms.
if ( !is_pb )
{
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
}
else // if ( is_pb )
{
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
}
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), and the induced method.
bli_gks_cntx_set_blkszs
@@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx )
BLIS_KR, BLIS_KR, 1.0, 1.0,
cntx
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
}
// Set the anti-preference field to TRUE when executing a panel-block
// algorithm, and FALSE otherwise. This will cause higher-level generic
// code to establish (if needed) disagreement between the storage of C and
// the micro-kernel output preference so that the two will come back into
agreement in the panel-block macro-kernel (which is implemented in terms
// of the block-panel macro-kernel with some induced transpositions).
bli_cntx_set_anti_pref( is_pb, cntx );
}
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx )

View File

@@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx );
void bli_gemm4m1_cntx_finalize( cntx_t* cntx );
void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx );
void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx );
void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx );
void bli_gemm1m_cntx_finalize( cntx_t* cntx );

View File

@@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
}
void bli_trsm3m1_cntx_finalize( cntx_t* cntx )
@@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
}
void bli_trsm4m1_cntx_finalize( cntx_t* cntx )
@@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
}
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
@@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
}
}

View File

@@ -0,0 +1,85 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- gemmbp/gemmpb ------------------------------------------------------------
#undef GENFRONT

// Expands to an object-based front-end for an induced-method gemm that is
// hard-coded to a particular algorithm:
//   opname: the operation name (gemm).
//   imeth:  the induced method (1m).
//   alg:    the algorithm, either bp (block-panel) or pb (panel-block).
// The generated function is named bli_<opname><imeth><alg>(), e.g.
// bli_gemm1mbp() or bli_gemm1mpb(). It builds a local context and control
// tree specific to the chosen algorithm, invokes the operation's front end,
// and then cleans up.
#define GENFRONT( opname, imeth, alg ) \
\
void PASTEMAC2(opname,imeth,alg) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
) \
{ \
num_t dt = bli_obj_datatype( *c ); \
cntx_t cntx; \
cntl_t* cntl_p; \
\
/* If the objects are in the real domain, execute the native
implementation. (An induced method is only needed for complex
domain operands.) */ \
if ( bli_obj_is_real( *c ) ) \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \
return; \
} \
\
/* Initialize a local 1m context for the current algorithm (bp or pb).
This expands to e.g. bli_gemm1mbp_cntx_init( dt, &cntx ). */ \
PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \
\
/* Create a control tree for the current algorithm (bp or pb). This
expands to e.g. bli_gemmbp_cntl_create() or bli_gemmpb_cntl_create(). */ \
cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \
\
/* Invoke the operation's front end using the context and control
tree we just created. */ \
PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \
\
/* Free the control tree. Since the implementation will only make
copies of it (and not use it directly) we do not need to supply
a thread object. */ \
bli_cntl_free( cntl_p, NULL ); \
\
/* Finalize the local context. */ \
PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \
}
// gemm
GENFRONT( gemm, 1m, bp )
GENFRONT( gemm, 1m, pb )

View File

@@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \
return; \
} \
\
/* A temporary hack to easily specify the 1m algorithm (block-panel or
panel-block). */ \
if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \
{ \
bli_gemm1mbp( alpha, a, b, beta, c ); \
return; \
} \
\
/* Initialize a local context if the one provided is NULL. */ \
bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \

View File

@@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 )
GENPROT_NO2OP( 4mh )
GENPROT_NO2OP( 4mb )
//
// Generate object-based prototypes for 1m methods that specify an algorithm
// (e.g., block-panel or panel-block).
//
#undef GENPROT
#define GENPROT( imeth, alg ) \
\
void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \
GENPROT( 1m, bp )
GENPROT( 1m, pb )

View File

@@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool_t row_pref = !col_pref; \
/*const bool_t row_pref = !col_pref;*/ \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
\
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
ctype_r beta_use; \
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
\
ctype_r* c_use; \
inc_t rs_c_use; \
@@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Sanity check: These should never occur because storage/preference
agreement is handled at a higher level. */ \
/*
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
*/ \
\
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride OR if for some reason the storage of c is not the
preferred storage of the micro-kernel, then we compute the
alpha*a*b product into temporary storage and then accumulate that
result into c afterwards. */ \
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should never occur
(though we could handle them if they did occur). */ \
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\
\
if ( using_ct ) \
{ \
/* In the atypical cases, we compute the result into temporary
workspace ct and then accumulated it back to c at the end. */ \
\
/* Set the strides of ct based on the preference of the underlying
native real domain gemm micro-kernel. Note that we set the ct
strides in units of complex elements. */ \
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
else { rs_ct = nr; cs_ct = 1; } \
\
beta_use = *zero_r; \
c_use = ( ctype_r* )ct; \
rs_c_use = rs_ct; \
cs_c_use = cs_ct; \
} \
else \
{ \
/* In a typical case, we use the real part of beta and accumulate
directly into the output matrix c. */ \
beta_use = beta_r; \
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
} \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
zero_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
&beta_use, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
\
/* If necessary, accumulate the final result in ct back to c. */ \
if ( using_ct ) \
{ \
dim_t i, j; \
\
/* Accumulate the final result in ct back to c. */ \
for ( j = 0; j < nr; ++j ) \
for ( i = 0; i < mr; ++i ) \
{ \
@@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
else \
{ \
/* In the typical cases, we use the real part of beta and
accumulate directly into the output matrix c. */ \
\
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
beta_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )

View File

@@ -0,0 +1,188 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"

// Reference micro-kernel for the 1m induced method. It computes a complex
// gemm micro-tile update, c = beta * c + alpha * a * b, by invoking the
// native REAL-domain gemm micro-kernel on micro-panels of a and b that were
// packed according to the 1e and 1r formats. Only a real-valued alpha is
// supported (see the safety check below); a beta with non-zero imaginary
// part is handled by computing into temporary storage and accumulating
// afterwards.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \
\
void PASTEMAC(ch,varname) \
( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
const num_t dt_r = PASTEMAC(chr,type); \
\
/* Query the real-domain native gemm micro-kernel and its storage
preference from the context. */ \
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
/*const bool_t row_pref = !col_pref;*/ \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
/* The real micro-kernel iterates over twice as many "k" updates, since
each complex element occupies two real elements in the 1e/1r packed
micro-panels. */ \
const dim_t k2 = 2 * k; \
\
/* Temporary micro-tile used when we cannot accumulate directly into c.
NOTE(review): ct holds ctype (complex) elements but its length is
computed with sizeof( ctype_r ); this reserves twice the bytes of
BLIS_STACK_BUF_MAX_SIZE -- confirm whether sizeof( ctype ) was
intended here. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype_r ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
inc_t rs_ct; \
inc_t cs_ct; \
\
/* View the packed complex micro-panels as real-domain buffers. */ \
ctype_r* restrict a_r = ( ctype_r* )a; \
\
ctype_r* restrict b_r = ( ctype_r* )b; \
\
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
\
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
/* The (real-valued) beta actually passed to the real micro-kernel:
zero when computing into ct, beta_r when computing into c. */ \
ctype_r beta_use; \
\
ctype_r* c_use; \
inc_t rs_c_use; \
inc_t cs_c_use; \
\
bool_t using_ct; \
\
\
/* SAFETY CHECK: The higher level implementation should never
allow an alpha with non-zero imaginary component to be passed
in, because it can't be applied properly using the 1m method.
If alpha is not real, then something is very wrong. */ \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Sanity check: These should never occur because storage/preference
agreement is handled at a higher level. */ \
/*
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
*/ \
\
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should never occur
(though we could handle them if they did occur). */ \
if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\
\
if ( using_ct ) \
{ \
/* Set the strides of ct based on the preference of the underlying
native real domain gemm micro-kernel. Note that we set the ct
strides in units of complex elements. */ \
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
else { rs_ct = nr; cs_ct = 1; } \
\
/* Compute alpha*a*b into ct (beta of zero); the full complex beta
is applied during the accumulation loop at the end. */ \
beta_use = *zero_r; \
c_use = ( ctype_r* )ct; \
rs_c_use = rs_ct; \
cs_c_use = cs_ct; \
} \
else \
{ \
/* In a typical case, we use the real part of beta and accumulate
directly into the output matrix c. */ \
beta_use = beta_r; \
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
} \
\
\
/* Convert the strides from being in units of complex elements to
be in units of real elements. Note that we don't need to check for
general storage here because that case corresponds to the scenario
where we are using the ct buffer and its rs_ct/cs_ct strides. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \
else rs_c_use *= 2; \
\
\
/* The following gemm micro-kernel call implements the 1m method,
which induces a complex matrix multiplication by calling the
real matrix micro-kernel on micro-panels that have been packed
according to the 1e and 1r formats. */ \
\
/* c = beta * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
k2, \
alpha_r, \
a_r, \
b_r, \
&beta_use, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
\
/* If necessary, accumulate the final result in ct back to c,
applying the full complex beta via xpbys: c = ct + beta * c. */ \
if ( using_ct ) \
{ \
dim_t i, j; \
\
for ( j = 0; j < nr; ++j ) \
for ( i = 0; i < mr; ++i ) \
{ \
PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
*beta, \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR )

View File

@@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \
\
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
const dim_t k2 = 2 * k; \
\

View File

@@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \
@@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \
const inc_t ld_a = cs_a; \
const inc_t ld_b = rs_b; \
\
const pack_t schema_b = bli_cntx_schema_b( cntx ); \
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
\
dim_t iter, i, j, l; \
dim_t n_behind; \