Various level-3 optimizations for row storage.

Details: - Implemented remaining two cases within bli_packm_blk_var2(), which allow packing from a lower or upper-stored symmetric/Hermitian matrix to column panels (which are row-stored). Previously one could only pack to row panels (which are column-stored). - Implemented various optimizations in the level-3 front-ends that allow more favorable access through row-stored matrices for gemm, hemm, herk, her2k, symm, syrk, and syr2k. - Cleaned up code in level-3 front-ends that has to do with setting target and execution datatypes.
2026-05-11 09:39:59 +00:00 · 2013-06-11 12:18:39 -05:00
parent 05a657a6b9
commit 08475e7c76
21 changed files with 871 additions and 408 deletions
--- a/frame/1m/packm/bli_packm_blk_var2.c
+++ b/frame/1m/packm/bli_packm_blk_var2.c
@@ -143,6 +143,7 @@ void PASTEMAC(ch,varname )( \
 	dim_t           panel_len; \
 	doff_t          diagoffc_i; \
 	doff_t          diagoffc_inc; \
+	doff_t          diagoffc_i_abs; \
 	dim_t           panel_dim_i; \
 	inc_t           vs_c; \
 	inc_t           incc, ldc; \
@@ -171,6 +172,7 @@ void PASTEMAC(ch,varname )( \
 	ctype* restrict p11; \
 	dim_t           p11_m; \
 	dim_t           p11_n; \
+	inc_t           rs_p11, cs_p11; \
 \
 \
 	/* Extract the conjugation bit from the transposition argument. */ \
@@ -191,7 +193,7 @@ void PASTEMAC(ch,varname )( \
 	   we are packing to row panels. */ \
 	if ( bli_is_row_stored( rs_p, cs_p ) ) \
 	{ \
-		/* Prepare to pack to column panels. */ \
+		/* Prepare to pack to row-stored column panels. */ \
 		iter_dim     = n; \
 		panel_len    = m; \
 		panel_dim    = pd_p; \
@@ -204,10 +206,12 @@ void PASTEMAC(ch,varname )( \
 		n_panel      = &panel_dim_i; \
 		m_panel_max  = m_max; \
 		n_panel_max  = panel_dim; \
+		rs_p11       = rs_p; \
+		cs_p11       = 1; \
 	} \
 	else /* if ( bli_is_col_stored( rs_p, cs_p ) ) */ \
 	{ \
-		/* Prepare to pack to row panels. */ \
+		/* Prepare to pack to column-stored row panels. */ \
 		iter_dim     = m; \
 		panel_len    = n; \
 		panel_dim    = pd_p; \
@@ -220,6 +224,8 @@ void PASTEMAC(ch,varname )( \
 		n_panel      = &n; \
 		m_panel_max  = panel_dim; \
 		n_panel_max  = n_max; \
+		rs_p11       = 1; \
+		cs_p11       = cs_p; \
 	} \
 \
 	/* Compute the total number of iterations we'll need. */ \
@@ -237,11 +243,11 @@ void PASTEMAC(ch,varname )( \
 	for ( ic  = ic0,    ip  = ip0,    it  = 0; it < num_iter; \
 	      ic += ic_inc, ip += ip_inc, it += 1 ) \
 	{ \
-		panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
+		panel_dim_i    = bli_min( panel_dim, iter_dim - ic ); \
 \
-		diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
-		c_begin     = c_cast   + (ic  )*vs_c; \
-		p_begin     = p_cast   + (ip  )*ps_p; \
+		diagoffc_i     = diagoffc + (ip  )*diagoffc_inc; \
+		c_begin        = c_cast   + (ic  )*vs_c; \
+		p_begin        = p_cast   + (ip  )*ps_p; \
 \
 		/* If the current panel intersects the diagonal and C is either
 		   upper- or lower-stored, then we assume C is symmetric or
@@ -252,17 +258,76 @@ void PASTEMAC(ch,varname )( \
 		if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
 		     bli_is_upper_or_lower( uploc ) ) \
 		{ \
-			/* Only two of four cases implemented, since BLIS2 currently does
-			   not support triangular packing of matrix B (which is row-stored). */ \
-			/*if      ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) )  \
+			diagoffc_i_abs = bli_abs( diagoffc_i ); \
+\
+			/*if      ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) */ \
+			if      ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
+			          ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
 			{ \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+				p10_dim    = panel_dim_i; \
+				p10_len    = diagoffc_i_abs; \
+				p10        = p_begin; \
+				c10        = c_begin; \
+				incc10     = incc; \
+				ldc10      = ldc; \
+				conjc10    = conjc; \
+\
+				p12_dim    = panel_dim_i; \
+				p12_len    = panel_len - p10_len; \
+				j          = p10_len; \
+				diagoffc12 = diagoffc_i_abs - j; \
+				p12        = p_begin + (j  )*ldp; \
+				c12        = c_begin + (j  )*ldc; \
+				c12        = c12 + diagoffc12 * ( doff_t )cs_c + \
+				                  -diagoffc12 * ( doff_t )rs_c;  \
+				incc12     = ldc; \
+				ldc12      = incc; \
+				conjc12    = conjc; \
+\
+				p11_m      = panel_dim_i; \
+				p11_n      = panel_dim_i; \
+				j          = diagoffc_i_abs; \
+				p11        = p_begin + (j  )*ldp; \
+				c11        = c_begin + (j  )*ldc; \
+\
+				if ( bli_is_hermitian( strucc ) ) \
+					bli_toggle_conj( conjc12 ); \
 			} \
-			else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
+			/*else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
+			else /* if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
+			             ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
 			{ \
-				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
+				p10_dim    = panel_dim_i; \
+				p10_len    = diagoffc_i_abs + panel_dim_i; \
+				diagoffc10 = diagoffc_i; \
+				p10        = p_begin; \
+				c10        = c_begin; \
+				c10        = c10 + diagoffc10 * ( doff_t )cs_c + \
+				                  -diagoffc10 * ( doff_t )rs_c;  \
+				incc10     = ldc; \
+				ldc10      = incc; \
+				conjc10    = conjc; \
+\
+				p12_dim    = panel_dim_i; \
+				p12_len    = panel_len - p10_len; \
+				j          = p10_len; \
+				p12        = p_begin + (j  )*ldp; \
+				c12        = c_begin + (j  )*ldc; \
+				incc12     = incc; \
+				ldc12      = ldc; \
+				conjc12    = conjc; \
+\
+				p11_m      = panel_dim_i; \
+				p11_n      = panel_dim_i; \
+				j          = diagoffc_i_abs; \
+				p11        = p_begin + (j  )*ldp; \
+				c11        = c_begin + (j  )*ldc; \
+\
+				if ( bli_is_hermitian( strucc ) ) \
+					bli_toggle_conj( conjc10 ); \
 			} \
-			else*/ if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
+/*
+			else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
 			{ \
 				p10_dim    = panel_dim_i; \
 				p10_len    = diagoffc_i + panel_dim_i; \
@@ -279,7 +344,7 @@ void PASTEMAC(ch,varname )( \
 				p12_len    = panel_len - p10_len; \
 				j          = p10_len; \
 				p12        = p_begin + (j  )*ldp; \
-				c12        = c_begin + (j  )*cs_c; \
+				c12        = c_begin + (j  )*ldc; \
 				incc12     = incc; \
 				ldc12      = ldc; \
 				conjc12    = conjc; \
@@ -288,12 +353,12 @@ void PASTEMAC(ch,varname )( \
 				p11_n      = panel_dim_i; \
 				j          = diagoffc_i; \
 				p11        = p_begin + (j  )*ldp; \
-				c11        = c_begin + (j  )*cs_c; \
+				c11        = c_begin + (j  )*ldc; \
 \
 				if ( bli_is_hermitian( strucc ) ) \
 					bli_toggle_conj( conjc10 ); \
 			} \
-			else /* if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
+			else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
 			{ \
 				p10_dim    = panel_dim_i; \
 				p10_len    = diagoffc_i; \
@@ -308,7 +373,7 @@ void PASTEMAC(ch,varname )( \
 				j          = p10_len; \
 				diagoffc12 = diagoffc_i - j; \
 				p12        = p_begin + (j  )*ldp; \
-				c12        = c_begin + (j  )*cs_c; \
+				c12        = c_begin + (j  )*ldc; \
 				c12        = c12 + diagoffc12 * ( doff_t )cs_c + \
 				                  -diagoffc12 * ( doff_t )rs_c;  \
 				incc12     = ldc; \
@@ -319,11 +384,12 @@ void PASTEMAC(ch,varname )( \
 				p11_n      = panel_dim_i; \
 				j          = diagoffc_i; \
 				p11        = p_begin + (j  )*ldp; \
-				c11        = c_begin + (j  )*cs_c; \
+				c11        = c_begin + (j  )*ldc; \
 \
 				if ( bli_is_hermitian( strucc ) ) \
 					bli_toggle_conj( conjc12 ); \
 			} \
+*/ \
 \
 			/* Pack to P10. For upper storage, this includes the unstored
 			   triangle of C11. */ \
@@ -351,11 +417,16 @@ void PASTEMAC(ch,varname )( \
 			                            p11_m, \
 			                            p11_n, \
 			                            beta_cast, \
-			                            c11, rs_c, cs_c, \
-			                            p11, 1,    ldp ); \
+			                            c11, rs_c,   cs_c, \
+			                            p11, rs_p11, cs_p11 ); \
 		} \
 		else \
 		{ \
+			/* Note that the following code executes if the current panel either:
+			   - does not intersect the diagonal, or
+			   - does intersect the diagonal, BUT the matrix is general
+			   which means the entire current panel can be copied at once. */ \
+\
 			/* We use some c10-specific variables here because we might need
 			   to change them if the current panel is unstored. (The values
 			   below are used if the current panel is stored.) */ \
@@ -368,8 +439,9 @@ void PASTEMAC(ch,varname )( \
 			   adjustments so we refer to the data where it is actually
 			   stored, and so we take conjugation into account. (Note
 			   this implicitly assumes we are operating on a symmetric or
-			   Hermitian matrix.) */ \
-			if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len ) ) \
+			   Hermitian matrix, since a general matrix would not contain
+			   any unstored region.) */ \
+			if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \
 			{ \
 				c10 = c10 + diagoffc_i * ( doff_t )cs_c + \
 				           -diagoffc_i * ( doff_t )rs_c;  \
@@ -440,7 +512,7 @@ void PASTEMAC(ch,varname )( \
 		                      p_begin, 1, cs_p, "%4.1f", "" ); \
 		if ( cs_p == 1 ) \
 		PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
-		                      p_begin, panel_dim, 1, "%6.3f", "" ); \
+		                      p_begin, panel_dim, 1, "%8.5f", "" ); \
 */ \
 	} \
 }
--- a/frame/1m/packm/bli_packm_blk_var3.c
+++ b/frame/1m/packm/bli_packm_blk_var3.c
@@ -271,7 +271,7 @@ void PASTEMAC(ch,varname )( \
 		} \
 		else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) ) \
 		{ \
-			/* Only two of four cases implemented, since BLIS2 currently does
+			/* Only two of four cases implemented, since BLIS currently does
 			   not support triangular packing of matrix B. */ \
 			/*if      ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) )  \
 			{ \
--- a/frame/3/gemm/bli_gemm.c
+++ b/frame/3/gemm/bli_gemm.c
@@ -48,13 +48,12 @@ void bli_gemm( obj_t*  alpha,
 	gemm_t* cntl;
 	obj_t   alpha_local;
 	obj_t   beta_local;
-	num_t   dt_targ_a;
-	num_t   dt_targ_b;
-	num_t   dt_targ_c;
-	num_t   dt_exec;
+	obj_t   a_local;
+	obj_t   b_local;
+	obj_t   c_local;
 	num_t   dt_alpha;
 	num_t   dt_beta;
-	bool_t  pack_c = FALSE;
+	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -67,105 +66,56 @@ void bli_gemm( obj_t*  alpha,
 		return;
 	}

-	// Determine the target datatype of each matrix object.
-	bli_gemm_get_target_datatypes( a,
-	                               b,
-	                               c,
-	                               &dt_targ_a,
-	                               &dt_targ_b,
-	                               &dt_targ_c,
-	                               &pack_c );
-	
-	// Set the target datatypes for each matrix object.
-	bli_obj_set_target_datatype( dt_targ_a, *a );
-	bli_obj_set_target_datatype( dt_targ_b, *b );
-	bli_obj_set_target_datatype( dt_targ_c, *c );
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( *a, a_local );
+	bli_obj_alias_to( *b, b_local );
+	bli_obj_alias_to( *c, c_local );

-	// Determine the execution datatype. Generally speaking, the
-	// execution datatype is the real projection of the target datatype
-	// of c. This rule holds unless the target datatypes of a and b
-	// are both complex, in which case the execution datatype is also
-	// complex.
-	if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
-		dt_exec = dt_targ_c;
-	else
-		dt_exec = bli_datatype_proj_to_real( dt_targ_c );
-
-	// Embed the execution datatype in all matrix operands.
-	bli_obj_set_execution_datatype( dt_exec, *a );
-	bli_obj_set_execution_datatype( dt_exec, *b );
-	bli_obj_set_execution_datatype( dt_exec, *c );
-
-	// Note that the precisions of the target datatypes of a, b, and c
-	// match. The domains, however, are not necessarily the same. There
-	// are eight possible combinations of target domains:
-	//
-	//   case  input     target    exec    pack  notes  
-	//         domain    domain    domain  c?      
-	//         c+=a*b    c+=a*b     
-	//   (0)   r  r r    r  r r    r              
-	//   (1)   r  r c    r  r r    r             b demoted to real
-	//   (2)   r  c r    r  r r    r             a demoted to real
-	//   (3)   r  c c    c  c c    c       yes   a*b demoted to real
-	//   (4)   c  r r    r  r r    r       yes   copynzm used to update c
-	//   (5)   c  r c    c  r c    r       yes   transposed to induce (6)
-	//   (6)   c  c r    c  c r    r       ~     c and a treated as real
-	//   (7)   c  c c    c  c c    c              
-	//   ~ Must pack c if not column-stored (ie: row or general storage).
-	//
-	// There are two special cases: (5) and (6). Because the inner kernels
-	// assume column storage, it is easy to implement (6) since we can
-	// simply treat matrices c and a as real matrices with inflated m
-	// dimension and column stride and then proceed with a kernel for real
-	// computation. We cannot pull the same trick with case (5) because it
-	// would result in a mismatch in the k dimension. But we can transform
-	// case (5) into case (6) by transposing all arguments and swapping the
-	// a and b operands. Also, we will need to pack matrix c. That is what
-	// we do here.
-	if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C. (The effect of the transposition of A and B is negligible
+	// because those operands are always packed to contiguous memory.)
+	if ( bli_obj_is_row_stored( *c ) )
 	{
-		bli_obj_swap_pointers( a, b );
-		bli_swap_types( dt_targ_a, dt_targ_b );
-		bli_obj_toggle_trans( *c );
-		bli_obj_toggle_trans( *a );
-		bli_obj_toggle_trans( *b );
+		bli_obj_swap( a_local, b_local );
+
+		bli_obj_induce_trans( a_local );
+		bli_obj_induce_trans( b_local );
+		bli_obj_induce_trans( c_local );
 	}

-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix a. By inspecting the table above,
-	// this clearly works for cases (0) through (4), (6), and (7). It
-	// Also works for case (5) since it is transformed into case (6) by
-	// the above code.
-	dt_alpha = dt_targ_a;
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_gemm_set_targ_exec_datatypes( &a_local,
+	                                  &b_local,
+	                                  &c_local,
+	                                  &dt_alpha,
+	                                  &dt_beta,
+	                                  &pack_c );
+
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of c. Here's why: If c is real and beta is complex,
-	// there is no reason to keep beta_local in the complex domain since
-	// the complex part of beta*c will not be stored. If c is complex and
-	// beta is real then beta is harmlessly promoted to complex.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

-	// Choose the control tree based on whether it was determined we need
-	// to pack c.
-	//if ( pack_c ) gemm_cntl = gemm_cntl_packabc;
-	//else          gemm_cntl = gemm_cntl_packab;
+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+	// Choose the control tree.
 	cntl = gemm_cntl;
-	if ( pack_c ) bli_abort();

 	// Invoke the internal back-end.
 	bli_gemm_int( &alpha_local,
-	              a,
-	              b,
+	              &a_local,
+	              &b_local,
 	              &beta_local,
-	              c,
+	              &c_local,
 	              cntl );
 }

--- a/frame/3/gemm/bli_gemm_check.c
+++ b/frame/3/gemm/bli_gemm_check.c
@@ -75,8 +75,8 @@ void bli_gemm_basic_check( obj_t*  alpha,
 	// We don't enforce general structure in matrix A so we can use gemm to
 	// implement hemm/symm. Instead, we only check this from the front-end.

-	e_val = bli_check_general_object( b );
-	bli_check_error_code( e_val );
+	//e_val = bli_check_general_object( b );
+	//bli_check_error_code( e_val );

 	e_val = bli_check_general_object( c );
 	bli_check_error_code( e_val );
@@ -99,6 +99,8 @@ void bli_gemm_check( obj_t*  alpha,
 	e_val = bli_check_general_object( a );
 	bli_check_error_code( e_val );

+	e_val = bli_check_general_object( b );
+	bli_check_error_code( e_val );
 }

 void bli_gemm_int_check( obj_t*  alpha,
--- a/frame/3/gemm/bli_gemm_target.c
+++ b/frame/3/gemm/bli_gemm_target.c
@@ -34,6 +34,99 @@

 #include "blis.h"

+void bli_gemm_set_targ_exec_datatypes( obj_t*  a,
+                                       obj_t*  b,
+                                       obj_t*  c,
+                                       num_t*  dt_alpha,
+                                       num_t*  dt_beta,
+                                       bool_t* pack_c )
+{
+	num_t   dt_targ_a;
+	num_t   dt_targ_b;
+	num_t   dt_targ_c;
+	num_t   dt_exec;
+
+	// Determine the target datatype of each matrix object.
+	bli_gemm_get_target_datatypes( a,
+	                               b,
+	                               c,
+	                               &dt_targ_a,
+	                               &dt_targ_b,
+	                               &dt_targ_c,
+	                               pack_c );
+	
+	// Set the target datatypes for each matrix object.
+	bli_obj_set_target_datatype( dt_targ_a, *a );
+	bli_obj_set_target_datatype( dt_targ_b, *b );
+	bli_obj_set_target_datatype( dt_targ_c, *c );
+
+	// Determine the execution datatype. Generally speaking, the
+	// execution datatype is the real projection of the target datatype
+	// of c. This rule holds unless the target datatypes of a and b
+	// are both complex, in which case the execution datatype is also
+	// complex.
+	if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
+		dt_exec = dt_targ_c;
+	else
+		dt_exec = bli_datatype_proj_to_real( dt_targ_c );
+
+	// Embed the execution datatype in all matrix operands.
+	bli_obj_set_execution_datatype( dt_exec, *a );
+	bli_obj_set_execution_datatype( dt_exec, *b );
+	bli_obj_set_execution_datatype( dt_exec, *c );
+
+	// Note that the precisions of the target datatypes of a, b, and c
+	// match. The domains, however, are not necessarily the same. There
+	// are eight possible combinations of target domains:
+	//
+	//   case  input     target    exec    pack  notes  
+	//         domain    domain    domain  c?      
+	//         c+=a*b    c+=a*b     
+	//   (0)   r  r r    r  r r    r              
+	//   (1)   r  r c    r  r r    r             b demoted to real
+	//   (2)   r  c r    r  r r    r             a demoted to real
+	//   (3)   r  c c    c  c c    c       yes   a*b demoted to real
+	//   (4)   c  r r    r  r r    r       yes   copynzm used to update c
+	//   (5)   c  r c    c  r c    r       yes   transposed to induce (6)
+	//   (6)   c  c r    c  c r    r       ~     c and a treated as real
+	//   (7)   c  c c    c  c c    c              
+	//   ~ Must pack c if not column-stored (ie: row or general storage).
+	//
+	// There are two special cases: (5) and (6). Because the inner kernels
+	// assume column storage, it is easy to implement (6) since we can
+	// simply treat matrices c and a as real matrices with inflated m
+	// dimension and column stride and then proceed with a kernel for real
+	// computation. We cannot pull the same trick with case (5) because it
+	// would result in a mismatch in the k dimension. But we can transform
+	// case (5) into case (6) by transposing all arguments and swapping the
+	// a and b operands. Also, we will need to pack matrix c. That is what
+	// we do here.
+	if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
+	{
+		bli_obj_swap( *a, *b );
+		bli_swap_types( dt_targ_a, dt_targ_b );
+		bli_obj_toggle_trans( *c );
+		bli_obj_toggle_trans( *a );
+		bli_obj_toggle_trans( *b );
+	}
+
+	// Notice that we use the target datatype of matrix a. By inspecting
+	// the table above, this clearly works for cases (0) through (4), (6),
+	// and (7). It also works for case (5) since it is transformed into
+	// case (6) by the above code.
+	*dt_alpha = bli_obj_target_datatype( *a );
+
+	// Notice that we use the target datatype of matrix a. By inspecting
+	// the table above, this clearly works for cases (0) through (4), (6),
+	// and (7). It also works for case (5) since it is transformed into
+	// case (6) by the above code.
+	*dt_beta = bli_obj_datatype( *c );
+
+	// For now disable packing of C.
+	*pack_c = FALSE;
+}
+
+
 void bli_gemm_get_target_datatypes( obj_t*  a,
                                    obj_t*  b,
                                    obj_t*  c,
--- a/frame/3/gemm/bli_gemm_target.h
+++ b/frame/3/gemm/bli_gemm_target.h
@@ -32,6 +32,13 @@

 */

+void bli_gemm_set_targ_exec_datatypes( obj_t*  a,
+                                       obj_t*  b,
+                                       obj_t*  c,
+                                       num_t*  dt_alpha,
+                                       num_t*  dt_beta,
+                                       bool_t* pack_c );
+
 void bli_gemm_get_target_datatypes( obj_t*  a,
                                    obj_t*  b,
                                    obj_t*  c,
--- a/frame/3/hemm/bli_hemm.c
+++ b/frame/3/hemm/bli_hemm.c
@@ -52,11 +52,9 @@ void bli_hemm( side_t  side,
 	obj_t   a_local;
 	obj_t   b_local;
 	obj_t   c_local;
-	num_t   dt_targ_a;
-	num_t   dt_targ_b;
-	num_t   dt_targ_c;
 	num_t   dt_alpha;
 	num_t   dt_beta;
+	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -69,47 +67,52 @@ void bli_hemm( side_t  side,
 		return;
 	}

-	// Alias A and B in case we need to induce the right side case.
+	// Alias A, B, and C in case we need to apply transformations.
 	bli_obj_alias_to( *a, a_local );
 	bli_obj_alias_to( *b, b_local );
 	bli_obj_alias_to( *c, c_local );

-	// For now, assume the storage datatypes are the desired target
-	// datatypes.
-	dt_targ_a = bli_obj_datatype( *a );
-	dt_targ_b = bli_obj_datatype( *b );
-	dt_targ_c = bli_obj_datatype( *c );
-
-	// We implement hemm in terms of gemm. But in order to do so we must make
-	// sure matrix A is on the correct side for our gemm kernel. We assume
-	// gemm is implemented with a block-panel kernel, thus, we will only
-	// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
-	// transposing the operation. Since A is Hermitian, we can mark it for
-	// conjugation instead of transpostion (though transposition should be
-	// correctly handled as well).
-	if ( bli_is_right( side ) )
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C.
+	if ( bli_obj_is_row_stored( *c ) )
 	{
+		bli_toggle_side( side );
 		bli_obj_toggle_conj( a_local );
-		bli_obj_toggle_trans( b_local );
-		bli_obj_toggle_trans( c_local );
+		bli_obj_induce_trans( b_local );
+		bli_obj_induce_trans( c_local );
 	}

-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix A.
-	dt_alpha = dt_targ_a;
+	// Swap A and B if multiplying A from the right so that "B" contains
+	// the Hermitian matrix.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( a_local, b_local );
+	}
+
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_gemm_set_targ_exec_datatypes( &a_local,
+	                                  &b_local,
+	                                  &c_local,
+	                                  &dt_alpha,
+	                                  &dt_beta,
+	                                  &pack_c );
+
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of C.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
 	// Choose the control tree.
 	cntl = hemm_cntl;

--- a/frame/3/hemm/bli_hemm_cntl.c
+++ b/frame/3/hemm/bli_hemm_cntl.c
@@ -118,7 +118,8 @@ void bli_hemm_cntl_init()
 	                           hemm_kr,
 	                           hemm_nr,
 	                           FALSE, // do NOT scale by alpha
-	                           FALSE, // already dense; densify not necessary
+	                           //FALSE, // already dense; densify not necessary
+	                           TRUE,  // densify (if needed)
 	                           FALSE, // do NOT invert diagonal
 	                           FALSE, // reverse iteration if upper?
 	                           FALSE, // reverse iteration if lower?
--- a/frame/3/her2k/bli_her2k.c
+++ b/frame/3/her2k/bli_her2k.c
@@ -51,15 +51,13 @@ void bli_her2k( obj_t*  alpha,
 	obj_t    alpha_conj_local;
 	obj_t    beta_local;
 	obj_t    c_local;
-	obj_t    ah;
-	obj_t    bh;
-	num_t    dt_targ_a;
-	num_t    dt_targ_b;
-	num_t    dt_targ_c;
-	num_t    dt_exec;
+	obj_t    a_local;
+	obj_t    bh_local;
+	obj_t    b_local;
+	obj_t    ah_local;
 	num_t    dt_alpha;
 	num_t    dt_beta;
-	//bool_t   pack_c = FALSE;
+	bool_t   pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -72,84 +70,76 @@ void bli_her2k( obj_t*  alpha,
 		return;
 	}

-	// Alias C so we can reset it as the root object (in case it is not
-	// already a root object).
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( *a, a_local );
+	bli_obj_alias_to( *b, b_local );
 	bli_obj_alias_to( *c, c_local );
 	bli_obj_set_as_root( c_local );

-	// Create objects to track A' and B' (for the second rank-k update).
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *b, bh );
+	// For her2k, the first and second right-hand "B" operands are simply B'
+	// and A'.
+	bli_obj_alias_to( *b, bh_local );
+	bli_obj_induce_trans( bh_local );
+	bli_obj_toggle_conj( bh_local );
+	bli_obj_alias_to( *a, ah_local );
+	bli_obj_induce_trans( ah_local );
+	bli_obj_toggle_conj( ah_local );

-	// Determine the target datatype of each matrix object.
-	//bli_her2k_get_target_datatypes( a,
-	//                                b,
-	//                                c,
-	//                                &dt_targ_a,
-	//                                &dt_targ_b,
-	//                                &dt_targ_c,
-	//                                &pack_c );
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C. (The effect of the transposition of A and A' is negligible
+	// because those operands are always packed to contiguous memory.)
+	if ( bli_obj_is_row_stored( c_local ) )
+	{
+		bli_obj_toggle_conj( a_local );
+		bli_obj_toggle_conj( bh_local );
+		bli_obj_toggle_conj( b_local );
+		bli_obj_toggle_conj( ah_local );

-	dt_targ_a = bli_obj_datatype( *a );
-	dt_targ_b = bli_obj_datatype( *b );
-	dt_targ_c = bli_obj_datatype( *c );
+		bli_obj_induce_trans( c_local );
+	}

-	// Set the target datatypes for each matrix object.
-	bli_obj_set_target_datatype( dt_targ_a, *a  );
-	bli_obj_set_target_datatype( dt_targ_b, *b );
-	bli_obj_set_target_datatype( dt_targ_c, *c  );
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_her2k_set_targ_exec_datatypes( &a_local,
+	                                   &bh_local,
+	                                   &b_local,
+	                                   &ah_local,
+	                                   &c_local,
+	                                   &dt_alpha,
+	                                   &dt_beta,
+	                                   &pack_c );

-	// Determine the execution datatype.
-	dt_exec = dt_targ_a;
-
-	// Embed the execution datatype in all matrix operands.
-	bli_obj_set_execution_datatype( dt_exec, *a );
-	bli_obj_set_execution_datatype( dt_exec, *b );
-	bli_obj_set_execution_datatype( dt_exec, *c );
-
-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix a. By inspecting the table above,
-	// this clearly works for cases (0) through (4), (6), and (7). It
-	// Also works for case (5) since it is transformed into case (6) by
-	// the above code.
-	dt_alpha = dt_targ_a;
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

 	// Create an object to hold a copy-cast of conj(alpha).
-	dt_alpha = dt_targ_b;
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_CONJUGATE,
 	                             alpha,
 	                             &alpha_conj_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of c. Here's why: If c is real and beta is complex,
-	// there is no reason to keep beta_local in the complex domain since
-	// the complex part of beta*c will not be stored. If c is complex and
-	// beta is real then beta is harmlessly promoted to complex.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

-	// Choose the control tree based on whether it was determined we need
-	// to pack c.
-	//if ( pack_c ) her2k_cntl = her2k_cntl_packabc;
-	//else          her2k_cntl = her2k_cntl_packab;
+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+	// Choose the control tree.
 	cntl = her2k_cntl;
-	//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

 	// Invoke the internal back-end.
 	bli_her2k_int( &alpha_local,
-	               a,
-	               &bh,
+	               &a_local,
+	               &bh_local,
 	               &alpha_conj_local,
-	               b,
-	               &ah,
+	               &b_local,
+	               &ah_local,
 	               &beta_local,
 	               &c_local,
 	               cntl );
--- a/frame/3/her2k/bli_her2k.h
+++ b/frame/3/her2k/bli_her2k.h
@@ -35,6 +35,7 @@
 #include "bli_her2k_cntl.h"
 #include "bli_her2k_check.h"
 #include "bli_her2k_int.h"
+#include "bli_her2k_target.h"

 #include "bli_her2k_l_blk_var1.h"
 #include "bli_her2k_u_blk_var1.h"
--- a/frame/3/her2k/bli_her2k_target.c
+++ b/frame/3/her2k/bli_her2k_target.c
@@ -0,0 +1,99 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_her2k_set_targ_exec_datatypes( obj_t*  a,
+                                        obj_t*  bh,
+                                        obj_t*  b,
+                                        obj_t*  ah,
+                                        obj_t*  c,
+                                        num_t*  dt_alpha,
+                                        num_t*  dt_beta,
+                                        bool_t* pack_c )
+{
+	num_t   dt_targ_a;
+	num_t   dt_targ_bh;
+	num_t   dt_targ_b;
+	num_t   dt_targ_ah;
+	num_t   dt_targ_c;
+	num_t   dt_exec;
+
+	// Determine the target datatype of each matrix object.
+/*
+	bli_gemm_get_target_datatypes( a,
+	                               bh,
+	                               c,
+	                               &dt_targ_a,
+	                               &dt_targ_bh,
+	                               &dt_targ_c,
+	                               pack_c );
+*/
+	dt_targ_a  = bli_obj_datatype( *a );
+	dt_targ_bh = bli_obj_datatype( *bh );
+	dt_targ_b  = bli_obj_datatype( *b );
+	dt_targ_ah = bli_obj_datatype( *ah );
+	dt_targ_c  = bli_obj_datatype( *c );
+	dt_exec    = dt_targ_a;
+	
+	// Set the target datatypes for each matrix object.
+	bli_obj_set_target_datatype( dt_targ_a,  *a );
+	bli_obj_set_target_datatype( dt_targ_bh, *bh );
+	bli_obj_set_target_datatype( dt_targ_b,  *b );
+	bli_obj_set_target_datatype( dt_targ_ah, *ah );
+	bli_obj_set_target_datatype( dt_targ_c,  *c );
+
+	// Embed the execution datatype in all matrix operands.
+	bli_obj_set_execution_datatype( dt_exec, *a );
+	bli_obj_set_execution_datatype( dt_exec, *bh );
+	bli_obj_set_execution_datatype( dt_exec, *b );
+	bli_obj_set_execution_datatype( dt_exec, *ah );
+	bli_obj_set_execution_datatype( dt_exec, *c );
+
+	// Notice that we use the target datatype of matrix a. By inspecting
+	// the table above, this clearly works for cases (0) through (4), (6),
+	// and (7). It also works for case (5) since it is transformed into
+	// case (6) by the above code.
+	*dt_alpha = bli_obj_target_datatype( *a );
+
+	// Notice that we use the target datatype of matrix a. By inspecting
+	// the table above, this clearly works for cases (0) through (4), (6),
+	// and (7). It also works for case (5) since it is transformed into
+	// case (6) by the above code.
+	*dt_beta = bli_obj_datatype( *c );
+
+	// For now disable packing of C.
+	*pack_c = FALSE;
+}
+
--- a/frame/3/her2k/bli_her2k_target.h
+++ b/frame/3/her2k/bli_her2k_target.h
@@ -0,0 +1,43 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_her2k_set_targ_exec_datatypes( obj_t*  a,
+                                        obj_t*  bh,
+                                        obj_t*  b,
+                                        obj_t*  ah,
+                                        obj_t*  c,
+                                        num_t*  dt_alpha,
+                                        num_t*  dt_beta,
+                                        bool_t* pack_c );
+
--- a/frame/3/herk/bli_herk.c
+++ b/frame/3/herk/bli_herk.c
@@ -47,15 +47,12 @@ void bli_herk( obj_t*  alpha,
 	herk_t* cntl;
 	obj_t   alpha_local;
 	obj_t   beta_local;
+	obj_t   a_local;
+	obj_t   ah_local;
 	obj_t   c_local;
-	obj_t   ah;
-	num_t   dt_targ_a;
-	num_t   dt_targ_ah;
-	num_t   dt_targ_c;
-	num_t   dt_exec;
 	num_t   dt_alpha;
 	num_t   dt_beta;
-	//bool_t  pack_c = FALSE;
+	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -68,84 +65,58 @@ void bli_herk( obj_t*  alpha,
 		return;
 	}

-	// Alias C so we can reset it as the root object (in case it is not
-	// already a root object).
+	// Alias A and C in case we need to apply transformations.
+	bli_obj_alias_to( *a, a_local );
 	bli_obj_alias_to( *c, c_local );
 	bli_obj_set_as_root( c_local );

 	// For herk, the right-hand "B" operand is simply A'.
-	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
+	bli_obj_alias_to( *a, ah_local );
+	bli_obj_induce_trans( ah_local );
+	bli_obj_toggle_conj( ah_local );

-	// Determine the target datatype of each matrix object.
-	//bli_herk_get_target_datatypes( a,
-	//                               c,
-	//                               &dt_targ_a,
-	//                               &dt_targ_c,
-	//                               &pack_c );
-	
-	dt_targ_a  = bli_obj_datatype( *a );
-	dt_targ_ah = bli_obj_datatype( *a );
-	dt_targ_c  = bli_obj_datatype( *c );
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C. (The effect of the transposition of A and A' is negligible
+	// because those operands are always packed to contiguous memory.)
+	if ( bli_obj_is_row_stored( c_local ) )
+	{
+		bli_obj_toggle_conj( a_local );
+		bli_obj_toggle_conj( ah_local );

-	// Set the target datatypes for each matrix object.
-	bli_obj_set_target_datatype( dt_targ_a,  *a );
-	bli_obj_set_target_datatype( dt_targ_ah, ah );
-	bli_obj_set_target_datatype( dt_targ_c,  *c );
+		bli_obj_induce_trans( c_local );
+	}

-	// Determine the execution datatype. For herk, the execution
-	// datatype is always the target datatype of a.
-	dt_exec = dt_targ_a;
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_herk_set_targ_exec_datatypes( &a_local,
+	                                  &ah_local,
+	                                  &c_local,
+	                                  &dt_alpha,
+	                                  &dt_beta,
+	                                  &pack_c );

-	// Embed the execution datatype in all matrix operands.
-	bli_obj_set_execution_datatype( dt_exec, *a );
-	bli_obj_set_execution_datatype( dt_exec, ah );
-	bli_obj_set_execution_datatype( dt_exec, *c );
-
-	// Note that the precisions of the target datatypes of a and c
-	// match. The domains, however, are not necessarily the same. There
-	// are four possible combinations of target domains:
-	//
-	//   case  input     target    exec    pack  notes  
-    //         domain    domain    domain  c?      
-	//         c+=a*a'   c+=a*a'     
-	//   (0)   r  r r    r  r r    r              
-	//   (1)   r  c c    c  c c    c       yes   a*a' demoted to real
-	//   (2)   c  r r    r  r r    r       yes   copynzm used to update c
-	//   (3)   c  c c    c  c c    c              
-
-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix a. By inspecting the table above,
-	// this clearly works for cases (0) through (4), (6), and (7). It
-	// Also works for case (5) since it is transformed into case (6) by
-	// the above code.
-	dt_alpha = dt_targ_a;
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of c. Here's why: If c is real and beta is complex,
-	// there is no reason to keep beta_local in the complex domain since
-	// the complex part of beta*c will not be stored. If c is complex and
-	// beta is real then beta is harmlessly promoted to complex.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

-	// Choose the control tree based on whether it was determined we need
-	// to pack c.
-	//if ( pack_c ) herk_cntl = herk_cntl_packabc;
-	//else          herk_cntl = herk_cntl_packab;
+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+	// Choose the control tree.
 	cntl = herk_cntl;
-	//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

 	// Invoke the internal back-end.
 	bli_herk_int( &alpha_local,
-	              a,
-	              &ah,
+	              &a_local,
+	              &ah_local,
 	              &beta_local,
 	              &c_local,
 	              cntl );
--- a/frame/3/herk/bli_herk.h
+++ b/frame/3/herk/bli_herk.h
@@ -35,6 +35,7 @@
 #include "bli_herk_cntl.h"
 #include "bli_herk_check.h"
 #include "bli_herk_int.h"
+#include "bli_herk_target.h"

 #include "bli_herk_l_blk_var1.h"
 #include "bli_herk_u_blk_var1.h"
--- a/frame/3/herk/bli_herk_target.c
+++ b/frame/3/herk/bli_herk_target.c
@@ -0,0 +1,203 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_herk_set_targ_exec_datatypes( obj_t*  a,
+                                       obj_t*  ah,
+                                       obj_t*  c,
+                                       num_t*  dt_alpha,
+                                       num_t*  dt_beta,
+                                       bool_t* pack_c )
+{
+	num_t   dt_targ_a;
+	num_t   dt_targ_ah;
+	num_t   dt_targ_c;
+	num_t   dt_exec;
+
+	// Determine the target datatype of each matrix object.
+/*
+	bli_herk_get_target_datatypes( a,
+	                               c,
+	                               &dt_targ_a,
+	                               &dt_targ_c,
+	                               pack_c );
+*/
+	dt_targ_a  = bli_obj_datatype( *a );
+	dt_targ_ah = bli_obj_datatype( *ah );
+	dt_targ_c  = bli_obj_datatype( *c );
+	dt_exec    = dt_targ_a;
+	
+	// Set the target datatypes for each matrix object.
+	bli_obj_set_target_datatype( dt_targ_a,  *a );
+	bli_obj_set_target_datatype( dt_targ_ah, *ah );
+	bli_obj_set_target_datatype( dt_targ_c,  *c );
+
+	// Embed the execution datatype in all matrix operands.
+	bli_obj_set_execution_datatype( dt_exec, *a );
+	bli_obj_set_execution_datatype( dt_exec, *ah );
+	bli_obj_set_execution_datatype( dt_exec, *c );
+
+	*dt_alpha = bli_obj_target_datatype( *a );
+
+	*dt_beta = bli_obj_datatype( *c );
+
+	// For now disable packing of C.
+	*pack_c = FALSE;
+}
+
+/*
+void bli_herk_get_target_datatypes( obj_t*  a,
+                                    obj_t*  c,
+                                    num_t*  dt_a,
+                                    num_t*  dt_c,
+                                    bool_t* pack_c )
+{
+	prec_t tp_a, tp_c;
+	dom_t  td_a, td_c;
+
+	// Determine the target domains for each object.
+	bli_herk_get_target_domain( a,
+	                            c,
+	                            &td_a,
+	                            &td_c,
+	                            pack_c );
+
+	// Determine the target precisions for each object.
+	bli_herk_get_target_prec( a,
+	                          c,
+	                          &tp_a,
+	                          &tp_c,
+	                          pack_c );
+
+	// The target datatype of an object is simply the union of its
+	// target domain and target precision.
+	*dt_a = td_a | tp_a;
+	*dt_c = td_c | tp_c;
+}
+*/
+/*
+void bli_herk_get_target_domain( obj_t*  a,
+                                 obj_t*  c,
+                                 dom_t*  td_a,
+                                 dom_t*  td_c,
+                                 bool_t* pack_c )
+{
+	dom_t d_a = bli_obj_domain( *a );
+	dom_t d_c = bli_obj_domain( *c );
+
+	// Note that the precisions of the target datatypes of a and c
+	// match. The domains, however, are not necessarily the same. There
+	// are four possible combinations of target domains:
+	//
+	//   case  input     target    exec    pack  notes  
+	//         domain    domain    domain  c?      
+	//         c+=a*a'   c+=a*a'     
+	//   (0)   r  r r    r  r r    r              
+	//   (1)   r  c c    c  c c    c       yes   a*a' demoted to real
+	//   (2)   c  r r    r  r r    r       yes   copynzm used to update c
+	//   (3)   c  c c    c  c c    c
+
+	if ( bli_is_real( d_c ) )
+	{
+		if ( bli_is_complex( d_a ) )
+		{
+			*td_c = *td_a = BLIS_COMPLEX;
+			*pack_c = TRUE;
+		}
+		else
+		{
+			*td_c = *td_a = BLIS_REAL;
+		}
+	}
+	else // if ( bli_is_complex( d_c ) )
+	{
+		*td_a = d_a;
+
+		if ( bli_is_real( d_a ) )
+		{
+			*td_c = BLIS_REAL;
+			*pack_c = TRUE;
+		}
+		else
+		{
+			*td_c = BLIS_COMPLEX;
+
+			if ( bli_obj_is_real( *a ) ) *pack_c = TRUE;
+
+			if ( bli_obj_is_complex( *a ) )
+			{
+				if ( !bli_obj_is_col_stored( *c ) ) *pack_c = TRUE;
+			}
+		}
+	}
+}
+*/
+
+/*
+void bli_herk_get_target_prec( obj_t*  a,
+                               obj_t*  c,
+                               prec_t* tp_a,
+                               prec_t* tp_c,
+                               bool_t* pack_c )
+{
+	prec_t p_a = bli_obj_precision( *a );
+	prec_t p_c = bli_obj_precision( *c );
+
+	if ( bli_is_single_prec( p_c ) )
+	{
+		if ( bli_is_double_prec( p_a ) )
+		{
+			*tp_c = *tp_a = BLIS_DOUBLE_PREC;
+			*pack_c = TRUE;
+		}
+		else
+		{
+			*tp_c = *tp_a = BLIS_SINGLE_PREC;
+		}
+	}
+	else // if ( bli_is_double_prec( p_c ) )
+	{
+		if ( bli_is_single_prec( p_a ) )
+		{
+			*tp_c = *tp_a = BLIS_SINGLE_PREC;
+			*pack_c = TRUE;
+		}
+		else
+		{
+			*tp_c = *tp_a = BLIS_DOUBLE_PREC;
+		}
+	}
+}
+*/
--- a/frame/3/herk/bli_herk_target.h
+++ b/frame/3/herk/bli_herk_target.h
@@ -0,0 +1,66 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_herk_set_targ_exec_datatypes( obj_t*  a,
+                                       obj_t*  ah,
+                                       obj_t*  c,
+                                       num_t*  dt_alpha,
+                                       num_t*  dt_beta,
+                                       bool_t* pack_c );
+
+/*
+void bli_herk_get_target_datatypes( obj_t*  a,
+                                    obj_t*  b,
+                                    obj_t*  c,
+                                    num_t*  dt_a,
+                                    num_t*  dt_b,
+                                    num_t*  dt_c,
+                                    bool_t* pack_c );
+
+void bli_herk_get_target_domain( obj_t*  a,
+                                 obj_t*  b,
+                                 obj_t*  c,
+                                 dom_t*  td_a,
+                                 dom_t*  td_b,
+                                 dom_t*  td_c,
+                                 bool_t* pack_c );
+
+void bli_herk_get_target_prec( obj_t*  a,
+                               obj_t*  b,
+                               obj_t*  c,
+                               prec_t* tp_a,
+                               prec_t* tp_b,
+                               prec_t* tp_c,
+                               bool_t* pack_c );
+*/
--- a/frame/3/symm/bli_symm.c
+++ b/frame/3/symm/bli_symm.c
@@ -52,11 +52,9 @@ void bli_symm( side_t  side,
 	obj_t   a_local;
 	obj_t   b_local;
 	obj_t   c_local;
-	num_t   dt_targ_a;
-	num_t   dt_targ_b;
-	num_t   dt_targ_c;
 	num_t   dt_alpha;
 	num_t   dt_beta;
+	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -69,45 +67,51 @@ void bli_symm( side_t  side,
 		return;
 	}

-	// For now, assume the storage datatypes are the desired target
-	// datatypes.
-	dt_targ_a = bli_obj_datatype( *a );
-	dt_targ_b = bli_obj_datatype( *b );
-	dt_targ_c = bli_obj_datatype( *c );
-
-	// Alias A and B in case we need to induce the right side case.
+	// Alias A, B, and C in case we need to apply transformations.
 	bli_obj_alias_to( *a, a_local );
 	bli_obj_alias_to( *b, b_local );
 	bli_obj_alias_to( *c, c_local );

-	// We implement symm in terms of gemm. But in order to do so we must make
-	// sure matrix A is on the correct side for our gemm kernel. We assume
-	// gemm is implemented with a block-panel kernel, thus, we will only
-	// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
-	// transposing the operation. Since A is symmetric, we do not mark it
-	// for any conjugation or transposition.
-	if ( bli_is_right( side ) )
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C.
+	if ( bli_obj_is_row_stored( *c ) )
 	{
-		bli_obj_toggle_trans( b_local );
-		bli_obj_toggle_trans( c_local );
+		bli_toggle_side( side );
+		bli_obj_induce_trans( b_local );
+		bli_obj_induce_trans( c_local );
 	}

-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix A.
-	dt_alpha = dt_targ_a;
+	// Swap A and B if multiplying A from the right so that "B" contains
+	// the symmetric matrix.
+	if ( bli_is_right( side ) )
+	{
+		bli_obj_swap( a_local, b_local );
+	}
+
+    // Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_gemm_set_targ_exec_datatypes( &a_local,
+	                                  &b_local,
+	                                  &c_local,
+	                                  &dt_alpha,
+	                                  &dt_beta,
+	                                  &pack_c );
+
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of C.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
 	// Choose the control tree. We can just use hemm since the algorithm
 	// is nearly identical to that of symm.
 	cntl = hemm_cntl;
--- a/frame/3/syr2k/bli_syr2k.c
+++ b/frame/3/syr2k/bli_syr2k.c
@@ -50,15 +50,13 @@ void bli_syr2k( obj_t*  alpha,
 	obj_t    alpha_local;
 	obj_t    beta_local;
 	obj_t    c_local;
-	obj_t    at;
-	obj_t    bt;
-	num_t    dt_targ_a;
-	num_t    dt_targ_b;
-	num_t    dt_targ_c;
-	num_t    dt_exec;
+	obj_t    a_local;
+	obj_t    bt_local;
+	obj_t    b_local;
+	obj_t    at_local;
 	num_t    dt_alpha;
 	num_t    dt_beta;
-	//bool_t   pack_c = FALSE;
+	bool_t   pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -71,69 +69,52 @@ void bli_syr2k( obj_t*  alpha,
 		return;
 	}

-	// Alias C so we can reset it as the root object (in case it is not
-	// already a root object).
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( *a, a_local );
+	bli_obj_alias_to( *b, b_local );
 	bli_obj_alias_to( *c, c_local );
 	bli_obj_set_as_root( c_local );

-	// Create objects to track A^T and B^T (for the second rank-k update).
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, *b, bt );
+	// For syr2k, the first and second right-hand "B" operands are simply B'
+	// and A'.
+	bli_obj_alias_to( *b, bt_local );
+	bli_obj_induce_trans( bt_local );
+	bli_obj_alias_to( *a, at_local );
+	bli_obj_induce_trans( at_local );

-	// Determine the target datatype of each matrix object.
-	//bli_syr2k_get_target_datatypes( a,
-	//                                b,
-	//                                c,
-	//                                &dt_targ_a,
-	//                                &dt_targ_b,
-	//                                &dt_targ_c,
-	//                                &pack_c );
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C. (The effect of the transposition of A and A' is negligible
+	// because those operands are always packed to contiguous memory.)
+	if ( bli_obj_is_row_stored( c_local ) )
+	{
+		bli_obj_induce_trans( c_local );
+	}

-	dt_targ_a = bli_obj_datatype( *a );
-	dt_targ_b = bli_obj_datatype( *b );
-	dt_targ_c = bli_obj_datatype( *c );
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_her2k_set_targ_exec_datatypes( &a_local,
+	                                   &bt_local,
+	                                   &b_local,
+	                                   &at_local,
+	                                   &c_local,
+	                                   &dt_alpha,
+	                                   &dt_beta,
+	                                   &pack_c );

-	// Set the target datatypes for each matrix object.
-	bli_obj_set_target_datatype( dt_targ_a, *a );
-	bli_obj_set_target_datatype( dt_targ_b, *b );
-	bli_obj_set_target_datatype( dt_targ_c, *c );
-
-	// Determine the execution datatype.
-	dt_exec = dt_targ_a;
-
-	// Embed the execution datatype in all matrix operands.
-	bli_obj_set_execution_datatype( dt_exec, *a );
-	bli_obj_set_execution_datatype( dt_exec, *b );
-	bli_obj_set_execution_datatype( dt_exec, *c );
-
-
-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix a. By inspecting the table above,
-	// this clearly works for cases (0) through (4), (6), and (7). It
-	// Also works for case (5) since it is transformed into case (6) by
-	// the above code.
-	dt_alpha = dt_targ_a;
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of c. Here's why: If c is real and beta is complex,
-	// there is no reason to keep beta_local in the complex domain since
-	// the complex part of beta*c will not be stored. If c is complex and
-	// beta is real then beta is harmlessly promoted to complex.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

-	// Choose the control tree based on whether it was determined we need
-	// to pack c.
-	//if ( pack_c ) syr2k_cntl = her2k_cntl_packabc;
-	//else          syr2k_cntl = her2k_cntl_packab;
-	//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

 	// Choose the control tree. We can just use her2k since the algorithm
 	// is nearly identical to that of syr2k.
@@ -141,11 +122,11 @@ void bli_syr2k( obj_t*  alpha,

 	// Invoke the internal back-end.
 	bli_her2k_int( &alpha_local,
-	               a,
-	               &bt,
+	               &a_local,
+	               &bt_local,
 	               &alpha_local,
-	               b,
-	               &at,
+	               &b_local,
+	               &at_local,
 	               &beta_local,
 	               &c_local,
 	               cntl );
--- a/frame/3/syrk/bli_syrk.c
+++ b/frame/3/syrk/bli_syrk.c
@@ -47,15 +47,12 @@ void bli_syrk( obj_t*  alpha,
 	herk_t* cntl;
 	obj_t   alpha_local;
 	obj_t   beta_local;
+	obj_t   a_local;
+	obj_t   at_local;
 	obj_t   c_local;
-	obj_t   at;
-	num_t   dt_targ_a;
-	num_t   dt_targ_at;
-	num_t   dt_targ_c;
-	num_t   dt_exec;
 	num_t   dt_alpha;
 	num_t   dt_beta;
-	//bool_t  pack_c = FALSE;
+	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
@@ -68,78 +65,46 @@ void bli_syrk( obj_t*  alpha,
 		return;
 	}

-	// Alias C so we can reset it as the root object (in case it is not
-	// already a root object).
+	// Alias A and C in case we need to apply transformations.
+	bli_obj_alias_to( *a, a_local );
 	bli_obj_alias_to( *c, c_local );
 	bli_obj_set_as_root( c_local );

-	// For syrk, the right-hand "B" operand is simply A^T.
-	bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
+	// For herk, the right-hand "B" operand is simply A^T.
+	bli_obj_alias_to( *a, at_local );
+	bli_obj_induce_trans( at_local );

-	// Determine the target datatype of each matrix object.
-	//bli_syrk_get_target_datatypes( a,
-	//                               c,
-	//                               &dt_targ_a,
-	//                               &dt_targ_c,
-	//                               &pack_c );
-	
-	dt_targ_a  = bli_obj_datatype( *a );
-	dt_targ_at = bli_obj_datatype( *a );
-	dt_targ_c  = bli_obj_datatype( *c );
+	// An optimization: If C is row-stored, transpose the entire operation
+	// so as to allow the macro-kernel more favorable access patterns
+	// through C. (The effect of the transposition of A and A^T is negligible
+	// because those operands are always packed to contiguous memory.)
+	if ( bli_obj_is_row_stored( c_local ) )
+	{
+		bli_obj_induce_trans( c_local );
+	}

-	// Set the target datatypes for each matrix object.
-	bli_obj_set_target_datatype( dt_targ_a,  *a );
-	bli_obj_set_target_datatype( dt_targ_at, at );
-	bli_obj_set_target_datatype( dt_targ_c,  *c );
+	// Set the target and execution datatypes of the objects, and apply
+	// any transformations necessary to handle mixed domain computation.
+	bli_herk_set_targ_exec_datatypes( &a_local,
+	                                  &at_local,
+	                                  &c_local,
+	                                  &dt_alpha,
+	                                  &dt_beta,
+	                                  &pack_c );

-	// Determine the execution datatype. For syrk, the execution
-	// datatype is always the target datatype of a.
-	dt_exec = dt_targ_a;
-
-	// Embed the execution datatype in all matrix operands.
-	bli_obj_set_execution_datatype( dt_exec, *a );
-	bli_obj_set_execution_datatype( dt_exec, at );
-	bli_obj_set_execution_datatype( dt_exec, *c );
-
-	// Note that the precisions of the target datatypes of a and c
-	// match. The domains, however, are not necessarily the same. There
-	// are four possible combinations of target domains:
-	//
-	//   case  input     target    exec    pack  notes  
-	//         domain    domain    domain  c?      
-	//         c+=a*a'   c+=a*a'     
-	//   (0)   r  r r    r  r r    r              
-	//   (1)   r  c c    c  c c    c       yes   a*a^T demoted to real
-	//   (2)   c  r r    r  r r    r       yes   copynzm used to update c
-	//   (3)   c  c c    c  c c    c              
-
-	// Create an object to hold a copy-cast of alpha. Notice that we use
-	// the target datatype of matrix a. By inspecting the table above,
-	// this clearly works for cases (0) through (4), (6), and (7). It
-	// Also works for case (5) since it is transformed into case (6) by
-	// the above code.
-	dt_alpha = dt_targ_a;
+	// Create an object to hold a copy-cast of alpha.
 	bli_obj_init_scalar_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

-	// Create an object to hold a copy-cast of beta. Notice that we use
-	// the datatype of c. Here's why: If c is real and beta is complex,
-	// there is no reason to keep beta_local in the complex domain since
-	// the complex part of beta*c will not be stored. If c is complex and
-	// beta is real then beta is harmlessly promoted to complex.
-	dt_beta = bli_obj_datatype( *c );
+	// Create an object to hold a copy-cast of beta.
 	bli_obj_init_scalar_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );

-	// Choose the control tree based on whether it was determined we need
-	// to pack c.
-	//if ( pack_c ) syrk_cntl = herk_cntl_packabc;
-	//else          syrk_cntl = herk_cntl_packab;
-	//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

 	// Choose the control tree. We can just use herk since the algorithm
 	// is nearly identical to that of syrk.
@@ -147,8 +112,8 @@ void bli_syrk( obj_t*  alpha,

 	// Invoke the internal back-end.
 	bli_herk_int( &alpha_local,
-	              a,
-	              &at,
+	              &a_local,
+	              &at_local,
 	              &beta_local,
 	              &c_local,
 	              cntl );
--- a/frame/include/bli_obj_macro_defs.h
+++ b/frame/include/bli_obj_macro_defs.h
@@ -894,6 +894,16 @@ bli_obj_width_stored( obj )
 	                                   : ( bli_obj_buffer_at_off( obj ) ) \
 	         )

+
+// Swap objects
+
+#define bli_obj_swap( a, b ) \
+{ \
+       obj_t t; \
+       t = b; b = a; a = t; \
+}
+
+
 // Swap object pointers

 #define bli_obj_swap_pointers( a, b ) \
@@ -902,6 +912,7 @@ bli_obj_width_stored( obj )
 	t = b; b = a; a = t; \
 }

+
 // If a transposition is needed, induce one: swap dimensions, increments
 // and offsets, and then clear the trans bit.

--- a/frame/include/bli_scalar_macro_defs.h
+++ b/frame/include/bli_scalar_macro_defs.h
@@ -127,13 +127,13 @@

 // min, max, abs

-#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
-#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
-#define bli_abs( a )    ( (a) < 0 ? -(a) : (a) )
+#define bli_min( a, b )  ( (a) < (b) ? (a) : (b) )
+#define bli_max( a, b )  ( (a) > (b) ? (a) : (b) )
+#define bli_abs( a )     ( (a) < 0 ? -(a) : (a) )

 // fmin, fmax, fabs

-#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
+#define bli_min( a, b )  ( (a) < (b) ? (a) : (b) )
 #define bli_fmin( a, b ) bli_min( a, b )
 #define bli_fmax( a, b ) bli_max( a, b )
 #define bli_fabs( a )    ( (a) < 0.0 ? -(a) : (a) )