Added new "attached" scalar representation.

Details: - Added infrastructure to support a new scalar representation, whereby every object contains an internal scalar that defaults to 1.0. This facilitates passing scalars around without having to house them in separate objects. These "attached" scalars are stored in the internal atom_t field of the obj_t struct, and are always stored to be the same datatype as the object to which they are attached. Level-3 variants no longer take scalar arguments; however, level-3 internal back-ends still do; this is so that the calling function can perform subproblems such as C := C - alpha * A * B on-the-fly without needing to change either of the scalars attached to A or B. - Removed scalar argument from packm_int(). - Observe and apply attached scalars in scalm_int(), and removed scalar from interface of scalm_unb_var1(). - Renamed the following functions (and corresponding invocations): bli_obj_init_scalar_copy_of() -> bli_obj_scalar_init_detached_copy_of() bli_obj_init_scalar() -> bli_obj_scalar_init_detached() bli_obj_create_scalar_with_attached_buffer() -> bli_obj_create_1x1_with_attached_buffer() bli_obj_scalar_equals() -> bli_obj_equals() - Defined new functions: bli_obj_scalar_detach() bli_obj_scalar_attach() bli_obj_scalar_apply_scalar() bli_obj_scalar_reset() bli_obj_scalar_has_nonzero_imag() bli_obj_scalar_equals() - Placed all bli_obj_scalar_* functions in a new file, bli_obj_scalar.c. - Renamed the following macros: bli_obj_scalar_buffer() -> bli_obj_buffer_for_1x1() bli_obj_is_scalar() -> bli_obj_is_1x1() - Defined new macros to set and copy internal scalars between objects: bli_obj_set_internal_scalar() bli_obj_copy_internal_scalar() - In level-3 internal back-ends, added conditional blocks where alpha and beta are checked for non-unit-ness. Those values for alpha and beta are applied to the scalars attached to aliases of A/B/C, as appropriate, before being passed into the variant specified by the control tree.
- In level-3 blocked variants, pass BLIS_ONE into subproblems instead of alpha and/or beta. - In level-3 macro-kernels, changed how scalars are obtained. Now, scalars attached to A and B are multiplied together to obtain alpha, while beta is obtained directly from C. - In level-3 front-ends, removed old function calls meant to provide future support for mixed domain/precision. These can be added back later once that functionality is given proper treatment. Also, removed the creation of copy-casts of alpha and beta since typecasting of scalars is now implicitly handled in the internal back-ends when alpha and beta are applied to the attached scalars.
2026-05-12 01:59:59 +00:00 · 2013-12-03 16:08:30 -06:00
parent 992de486d6
commit b444489f10
247 changed files with 1461 additions and 1645 deletions
--- a/frame/0/getsc/bli_getsc.c
+++ b/frame/0/getsc/bli_getsc.c
@@ -56,7 +56,7 @@ void bli_getsc( obj_t*  chi,
 	// If chi is a constant object, default to using the dcomplex
 	// value within since we don't know if the caller needs just the
 	// real or the real and imaginary parts.
-	void*     buf_chi  = bli_obj_scalar_buffer( dt_def, *chi );
+	void*     buf_chi  = bli_obj_buffer_for_1x1( dt_def, *chi );

 	FUNCPTR_T f;

--- a/frame/1/axpyv/bli_axpyv.c
+++ b/frame/1/axpyv/bli_axpyv.c
@@ -58,7 +58,7 @@ void PASTEMAC0(opname)( \
 	dt_x = bli_obj_datatype( *x ); \
 \
 	/* Create an object to hold a copy-cast of alpha. */ \
-	bli_obj_init_scalar_copy_of( dt_x, \
+	bli_obj_scalar_init_detached_copy_of( dt_x, \
 	                             BLIS_NO_CONJUGATE, \
 	                             alpha, \
 	                             &alpha_local ); \
--- a/frame/1/dotxv/bli_dotxv_unb_var1.c
+++ b/frame/1/dotxv/bli_dotxv_unb_var1.c
@@ -93,11 +93,11 @@ void bli_dotxv_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of x and y. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of rho.
 	dt_beta   = dt_rho;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/1/scal2v/bli_scal2v.c
+++ b/frame/1/scal2v/bli_scal2v.c
@@ -58,7 +58,7 @@ void PASTEMAC0(opname)( \
 	dt_x = bli_obj_datatype( *x ); \
 \
 	/* Create an object to hold a copy-cast of beta. */ \
-	bli_obj_init_scalar_copy_of( dt_x, \
+	bli_obj_scalar_init_detached_copy_of( dt_x, \
 	                             BLIS_NO_CONJUGATE, \
 	                             beta, \
 	                             &beta_local ); \
--- a/frame/1/scalv/bli_scalv.c
+++ b/frame/1/scalv/bli_scalv.c
@@ -57,7 +57,7 @@ void PASTEMAC0(opname)( \
 	dt_x = bli_obj_datatype( *x ); \
 \
 	/* Create an object to hold a copy-cast of beta. */ \
-	bli_obj_init_scalar_copy_of( dt_x, \
+	bli_obj_scalar_init_detached_copy_of( dt_x, \
 	                             BLIS_NO_CONJUGATE, \
 	                             beta, \
 	                             &beta_local ); \
--- a/frame/1/scalv/bli_scalv_int.c
+++ b/frame/1/scalv/bli_scalv_int.c
@@ -64,7 +64,7 @@ void bli_scalv_int( obj_t*   beta,
 	if ( bli_obj_has_zero_dim( *x ) ) return;

 	// Return early if the beta scalar equals one.
-	if ( bli_obj_scalar_equals( beta, &BLIS_ONE ) ) return;
+	if ( bli_obj_equals( beta, &BLIS_ONE ) ) return;

 	// Extract the variant number and implementation type.
 	n = cntl_var_num( cntl );
--- a/frame/1/setv/bli_setv.c
+++ b/frame/1/setv/bli_setv.c
@@ -58,7 +58,7 @@ void PASTEMAC0(opname)( \
 	dt_x = bli_obj_datatype( *x ); \
 \
 	/* Create an object to hold a copy-cast of beta. */ \
-	bli_obj_init_scalar_copy_of( dt_x, \
+	bli_obj_scalar_init_detached_copy_of( dt_x, \
 	                             BLIS_NO_CONJUGATE, \
 	                             beta, \
 	                             &beta_local ); \
--- a/frame/1d/axpyd/bli_axpyd.c
+++ b/frame/1d/axpyd/bli_axpyd.c
@@ -53,7 +53,7 @@ void bli_axpyd( obj_t* alpha,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
--- a/frame/1d/scal2d/bli_scal2d.c
+++ b/frame/1d/scal2d/bli_scal2d.c
@@ -53,7 +53,7 @@ void bli_scal2d( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
--- a/frame/1d/scald/bli_scald.c
+++ b/frame/1d/scald/bli_scald.c
@@ -52,7 +52,7 @@ void bli_scald( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
--- a/frame/1d/setd/bli_setd.c
+++ b/frame/1d/setd/bli_setd.c
@@ -52,7 +52,7 @@ void bli_setd( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
--- a/frame/1f/axpyf/bli_axpyf_unb_var1.c
+++ b/frame/1f/axpyf/bli_axpyf_unb_var1.c
@@ -93,7 +93,7 @@ void bli_axpyf_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c
+++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c
@@ -113,11 +113,11 @@ void bli_dotxaxpyf_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.c
+++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.c
@@ -113,11 +113,11 @@ void bli_dotxaxpyf_unb_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/1f/dotxf/bli_dotxf_unb_var1.c
+++ b/frame/1f/dotxf/bli_dotxf_unb_var1.c
@@ -98,11 +98,11 @@ void bli_dotxf_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/1m/axpym/bli_axpym.c
+++ b/frame/1m/axpym/bli_axpym.c
@@ -53,7 +53,7 @@ void bli_axpym( obj_t* alpha,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
--- a/frame/1m/packm/bli_packm_blk_var2.c
+++ b/frame/1m/packm/bli_packm_blk_var2.c
@@ -46,7 +46,7 @@ typedef void (*FUNCPTR_T)(
                           dim_t   n,
                           dim_t   m_max,
                           dim_t   n_max,
-                           void*   beta,
+                           void*   kappa,
                           void*   c, inc_t rs_c, inc_t cs_c,
                           void*   p, inc_t rs_p, inc_t cs_p,
                                      dim_t pd_p, inc_t ps_p
@@ -55,8 +55,7 @@ typedef void (*FUNCPTR_T)(
 static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);


-void bli_packm_blk_var2( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_blk_var2( obj_t*   c,
                         obj_t*   p )
 {
 	num_t     dt_cp     = bli_obj_datatype( *c );
@@ -82,10 +81,16 @@ void bli_packm_blk_var2( obj_t*   beta,
 	dim_t     pd_p      = bli_obj_panel_dim( *p );
 	inc_t     ps_p      = bli_obj_panel_stride( *p );

-	void*     buf_beta  = bli_obj_scalar_buffer( dt_cp, *beta );
+	void*     buf_kappa;

 	FUNCPTR_T f;

+	// This variant assumes that the micro-kernel will always apply the
+	// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
+	// for kappa so that the underlying packm implementation does not
+	// scale during packing.
+	buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
+
 	// Index into the type combination array to extract the correct
 	// function pointer.
 	f = ftypes[dt_cp];
@@ -100,7 +105,7 @@ void bli_packm_blk_var2( obj_t*   beta,
 	   n_p,
 	   m_max_p,
 	   n_max_p,
-	   buf_beta,
+	   buf_kappa,
 	   buf_c, rs_c, cs_c,
 	   buf_p, rs_p, cs_p,
 	          pd_p, ps_p );
@@ -120,16 +125,16 @@ void PASTEMAC(ch,varname )( \
                            dim_t   n, \
                            dim_t   m_max, \
                            dim_t   n_max, \
-                            void*   beta, \
+                            void*   kappa, \
                            void*   c, inc_t rs_c, inc_t cs_c, \
                            void*   p, inc_t rs_p, inc_t cs_p, \
                                       dim_t pd_p, inc_t ps_p  \
                          ) \
 { \
-	ctype* restrict beta_cast = beta; \
-	ctype* restrict c_cast    = c; \
-	ctype* restrict p_cast    = p; \
-	ctype* restrict zero      = PASTEMAC(ch,0); \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict c_begin; \
 	ctype* restrict p_begin; \
 \
@@ -338,7 +343,7 @@ void PASTEMAC(ch,varname )( \
 			PASTEMAC(ch,packm_cxk)( conjc10, \
 			                        p10_dim, \
 			                        p10_len, \
-			                        beta_cast, \
+			                        kappa_cast, \
 			                        c10, incc10, ldc10, \
 			                        p10,         ldp ); \
 \
@@ -347,7 +352,7 @@ void PASTEMAC(ch,varname )( \
 			PASTEMAC(ch,packm_cxk)( conjc12, \
 			                        p12_dim, \
 			                        p12_len, \
-			                        beta_cast, \
+			                        kappa_cast, \
 			                        c12, incc12, ldc12, \
 			                        p12,         ldp ); \
 \
@@ -358,7 +363,7 @@ void PASTEMAC(ch,varname )( \
 			                                     conjc, \
 			                                     p11_m, \
 			                                     p11_n, \
-			                                     beta_cast, \
+			                                     kappa_cast, \
 			                                     c11, rs_c,   cs_c, \
 			                                     p11, rs_p11, cs_p11 ); \
 \
@@ -412,7 +417,7 @@ void PASTEMAC(ch,varname )( \
 			PASTEMAC(ch,packm_cxk)( conjc10, \
 			                        panel_dim_i, \
 			                        panel_len, \
-			                        beta_cast, \
+			                        kappa_cast, \
 			                        c10,     incc10, ldc10, \
 			                        p_begin,         ldp ); \
 \
--- a/frame/1m/packm/bli_packm_blk_var2.h
+++ b/frame/1m/packm/bli_packm_blk_var2.h
@@ -32,8 +32,7 @@

 */

-void bli_packm_blk_var2( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_blk_var2( obj_t*   c,
                         obj_t*   p );


@@ -50,7 +49,7 @@ void PASTEMAC(ch,varname)( \
                           dim_t   n, \
                           dim_t   m_max, \
                           dim_t   n_max, \
-                           void*   beta, \
+                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p, \
                                      dim_t pd_p, inc_t ps_p  \
--- a/frame/1m/packm/bli_packm_blk_var3.c
+++ b/frame/1m/packm/bli_packm_blk_var3.c
@@ -49,7 +49,7 @@ typedef void (*FUNCPTR_T)(
                           dim_t   n,
                           dim_t   m_max,
                           dim_t   n_max,
-                           void*   beta,
+                           void*   kappa,
                           void*   c, inc_t rs_c, inc_t cs_c,
                           void*   p, inc_t rs_p, inc_t cs_p,
                                      dim_t pd_p, inc_t ps_p
@@ -58,8 +58,7 @@ typedef void (*FUNCPTR_T)(
 static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);


-void bli_packm_blk_var3( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_blk_var3( obj_t*   c,
                         obj_t*   p )
 {
 	num_t     dt_cp     = bli_obj_datatype( *c );
@@ -88,10 +87,16 @@ void bli_packm_blk_var3( obj_t*   beta,
 	dim_t     pd_p      = bli_obj_panel_dim( *p );
 	inc_t     ps_p      = bli_obj_panel_stride( *p );

-	void*     buf_beta  = bli_obj_scalar_buffer( dt_cp, *beta );
+	void*     buf_kappa;

 	FUNCPTR_T f;

+	// This variant assumes that the micro-kernel will always apply the
+	// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
+	// for kappa so that the underlying packm implementation does not
+	// scale during packing.
+	buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
+
 	// Index into the type combination array to extract the correct
 	// function pointer.
 	f = ftypes[dt_cp];
@@ -109,7 +114,7 @@ void bli_packm_blk_var3( obj_t*   beta,
 	   n_p,
 	   m_max_p,
 	   n_max_p,
-	   buf_beta,
+	   buf_kappa,
 	   buf_c, rs_c, cs_c,
 	   buf_p, rs_p, cs_p,
 	          pd_p, ps_p );
@@ -132,16 +137,16 @@ void PASTEMAC(ch,varname )( \
                            dim_t   n, \
                            dim_t   m_max, \
                            dim_t   n_max, \
-                            void*   beta, \
+                            void*   kappa, \
                            void*   c, inc_t rs_c, inc_t cs_c, \
                            void*   p, inc_t rs_p, inc_t cs_p, \
                                       dim_t pd_p, inc_t ps_p  \
                          ) \
 { \
-	ctype* restrict beta_cast = beta; \
-	ctype* restrict c_cast    = c; \
-	ctype* restrict p_cast    = p; \
-	ctype* restrict zero      = PASTEMAC(ch,0); \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict c_begin; \
 	ctype* restrict p_begin; \
 \
@@ -317,7 +322,7 @@ void PASTEMAC(ch,varname )( \
 			PASTEMAC(ch,packm_cxk)( conjc, \
 			                        panel_dim_i, \
 			                        panel_len_i, \
-			                        beta_cast, \
+			                        kappa_cast, \
 			                        c_use, incc, ldc, \
 			                        p_use,       ldp ); \
 \
@@ -328,7 +333,7 @@ void PASTEMAC(ch,varname )( \
 				PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \
 				                                *m_panel_use, \
 				                                *n_panel_use, \
-				                                beta_cast, \
+				                                kappa_cast, \
 				                                p_use, rs_p, cs_p ); \
 			} \
 \
@@ -378,7 +383,7 @@ void PASTEMAC(ch,varname )( \
 			PASTEMAC(ch,packm_cxk)( conjc, \
 			                        panel_dim_i, \
 			                        panel_len_i, \
-			                        beta_cast, \
+			                        kappa_cast, \
 			                        c_use, incc, ldc, \
 			                        p_use,       ldp ); \
 \
--- a/frame/1m/packm/bli_packm_blk_var3.h
+++ b/frame/1m/packm/bli_packm_blk_var3.h
@@ -32,8 +32,7 @@

 */

-void bli_packm_blk_var3( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_blk_var3( obj_t*   c,
                         obj_t*   p );


@@ -53,7 +52,7 @@ void PASTEMAC(ch,varname)( \
                           dim_t   n, \
                           dim_t   m_max, \
                           dim_t   n_max, \
-                           void*   beta, \
+                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p, \
                                      dim_t pd_p, inc_t ps_p \
--- a/frame/1m/packm/bli_packm_check.c
+++ b/frame/1m/packm/bli_packm_check.c
@@ -34,33 +34,49 @@

 #include "blis.h"

-void bli_packm_check( obj_t*   beta,
-                      obj_t*   c,
-                      obj_t*   p,
-                      packm_t* cntl )
+
+void bli_packm_init_check( obj_t*   a,
+                           obj_t*   p,
+                           packm_t* cntl )
 {
 	err_t e_val;

 	// Check object datatypes.

-	e_val = bli_check_noninteger_object( beta );
+	e_val = bli_check_floating_object( a );
 	bli_check_error_code( e_val );

-	e_val = bli_check_floating_object( c );
-	bli_check_error_code( e_val );
+	// Check control tree pointer.

-	// Check object dimensions.
-
-	e_val = bli_check_scalar_object( beta );
-	bli_check_error_code( e_val );
-
-	// We don't check for conformal dimensions between c and p because
-	// p has not yet been initialized.
-
-	// Check control tree pointer
-
-	// NOTE: We can't check the control tree until we stop interpreting a
-	// NULL value (in bli_packm_int()) as a request to skip the operation.
+	// NOTE: We can't check the control tree because we interpret a NULL
+	// value (in bli_packm_int()) as a request to skip the operation.
+	//e_val = bli_check_valid_cntl( ( void* )cntl );
+	//bli_check_error_code( e_val );
+}
+
+void bli_packm_int_check( obj_t*   a,
+                          obj_t*   p,
+                          packm_t* cntl )
+{
+	err_t e_val;
+
+	// Check object datatypes.
+
+	e_val = bli_check_floating_object( a );
+	bli_check_error_code( e_val );
+
+	e_val = bli_check_floating_object( p );
+	bli_check_error_code( e_val );
+
+	// Check object dimensions.
+
+	e_val = bli_check_conformal_dims( a, p );
+	bli_check_error_code( e_val );
+
+	// Check control tree pointer.
+
+	// NOTE: We can't check the control tree because we interpret a NULL
+	// value (in bli_packm_int()) as a request to skip the operation.
 	//e_val = bli_check_valid_cntl( ( void* )cntl );
 	//bli_check_error_code( e_val );
 }
--- a/frame/1m/packm/bli_packm_check.h
+++ b/frame/1m/packm/bli_packm_check.h
@@ -32,7 +32,10 @@

 */

-void bli_packm_check( obj_t*   beta,
-                      obj_t*   c,
-                      obj_t*   p,
-                      packm_t* cntl );
+void bli_packm_init_check( obj_t*   a,
+                           obj_t*   p,
+                           packm_t* cntl );
+
+void bli_packm_int_check( obj_t*   a,
+                          obj_t*   p,
+                          packm_t* cntl );
--- a/frame/1m/packm/bli_packm_init.c
+++ b/frame/1m/packm/bli_packm_init.c
@@ -56,7 +56,7 @@ void bli_packm_init( obj_t*   a,

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bli_packm_check( &BLIS_ONE, a, p, cntl );
+		bli_packm_init_check( a, p, cntl );

 	// First check if we are to skip this operation because the control tree
 	// is NULL, and if so, simply alias the object to its packed counterpart.
--- a/frame/1m/packm/bli_packm_int.c
+++ b/frame/1m/packm/bli_packm_int.c
@@ -36,8 +36,7 @@

 #define FUNCPTR_T packm_fp

-typedef void (*FUNCPTR_T)( obj_t*   beta,
-                           obj_t*   a,
+typedef void (*FUNCPTR_T)( obj_t*   a,
                           obj_t*   p );

 static FUNCPTR_T vars[6][3] =
@@ -51,20 +50,17 @@ static FUNCPTR_T vars[6][3] =
 	{ NULL,               NULL,                  NULL,              },
 };

-void bli_packm_int( obj_t*   beta,
-                    obj_t*   a,
+void bli_packm_int( obj_t*   a,
                    obj_t*   p,
                    packm_t* cntl )
 {
-	obj_t*    beta_use;
-
 	varnum_t  n;
 	impl_t    i;
 	FUNCPTR_T f;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
-		bli_packm_check( beta, a, p, cntl );
+		bli_packm_int_check( a, p, cntl );

 	// Sanity check; A should never have a zero dimension. If we must support
 	// it, then we should fold it into the next alias-and-early-exit block.
@@ -106,13 +102,35 @@ void bli_packm_int( obj_t*   beta,
 		return;
 	}

-	// Notice that a beta parameter is always passed in. This value is allowed
-	// to be non-unit even when no scaling is prescribed. If the control tree
-	// indicates no scaling, then make sure that BLIS_ONE is passed into the
-	// packm implementation.
-	//if ( cntl_does_scale( cntl ) ) beta_use = beta;
-	//else                           beta_use = &BLIS_ONE;
-	beta_use = &BLIS_ONE;
+/*
+	// The value for kappa we use will depend on whether the scalar
+	// attached to A has a nonzero imaginary component. If it does,
+	// then we will apply the scalar during packing to facilitate
+	// implementing complex domain micro-kernels in terms of their
+	// real domain counterparts. (In the aforementioned situation,
+	// applying a real scalar is easy, but applying a complex one is
+	// harder, so we avoid the need altogether with the code below.)
+	if ( bli_obj_scalar_has_nonzero_imag( a ) )
+	{
+		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
+
+		// Detach the scalar.
+		bli_obj_scalar_detach( a, &kappa );
+
+		// Reset the attached scalar (to 1.0).
+		bli_obj_scalar_reset( a );
+
+		kappa_p = &kappa;
+	}
+	else
+	{
+		// If the internal scalar of A has only a real component, then
+		// we will apply it later (in the micro-kernel), and so we will
+		// use BLIS_ONE to indicate no scaling during packing.
+		kappa_p = &BLIS_ONE;
+	}
+*/
+

 	// Extract the variant number and implementation type.
 	n = cntl_var_num( cntl );
@@ -121,9 +139,8 @@ void bli_packm_int( obj_t*   beta,
 	// Index into the variant array to extract the correct function pointer.
 	f = vars[n][i];

-	// Invoke the variant with beta_use.
-	f( beta_use,
-	   a,
+	// Invoke the variant with kappa_use.
+	f( a,
 	   p );
 }

--- a/frame/1m/packm/bli_packm_int.h
+++ b/frame/1m/packm/bli_packm_int.h
@@ -32,8 +32,7 @@

 */

-void bli_packm_int( obj_t*   beta,
-                    obj_t*   c,
+void bli_packm_int( obj_t*   a,
                    obj_t*   p,
                    packm_t* cntl );

--- a/frame/1m/packm/bli_packm_unb_var1.c
+++ b/frame/1m/packm/bli_packm_unb_var1.c
@@ -47,7 +47,7 @@ typedef void (*FUNCPTR_T)(
                           dim_t   n,
                           dim_t   m_max,
                           dim_t   n_max,
-                           void*   beta,
+                           void*   kappa,
                           void*   c, inc_t rs_c, inc_t cs_c,
                           void*   p, inc_t rs_p, inc_t cs_p
                         );
@@ -55,8 +55,7 @@ typedef void (*FUNCPTR_T)(
 static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);


-void bli_packm_unb_var1( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_unb_var1( obj_t*   c,
                         obj_t*   p )
 {
 	num_t     dt_cp     = bli_obj_datatype( *c );
@@ -81,7 +80,7 @@ void bli_packm_unb_var1( obj_t*   beta,
 	inc_t     rs_p      = bli_obj_row_stride( *p );
 	inc_t     cs_p      = bli_obj_col_stride( *p );

-	void*     buf_beta  = bli_obj_scalar_buffer( dt_cp, *beta );
+	void*     buf_kappa;

 	FUNCPTR_T f;

@@ -89,6 +88,12 @@ void bli_packm_unb_var1( obj_t*   beta,
 	if ( bli_obj_is_dense( *p ) ) densify = TRUE;
 	else                          densify = FALSE;

+	// This variant assumes that the computational kernel will always apply
+	// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
+	// for kappa so that the underlying packm implementation does not scale
+	// during packing.
+	buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
+
 	// Index into the type combination array to extract the correct
 	// function pointer.
 	f = ftypes[dt_cp];
@@ -104,7 +109,7 @@ void bli_packm_unb_var1( obj_t*   beta,
 	   n_p,
 	   m_max_p,
 	   n_max_p,
-	   buf_beta,
+	   buf_kappa,
 	   buf_c, rs_c, cs_c,
 	   buf_p, rs_p, cs_p );
 }
@@ -124,20 +129,20 @@ void PASTEMAC(ch,varname)( \
                           dim_t   n, \
                           dim_t   m_max, \
                           dim_t   n_max, \
-                           void*   beta, \
+                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p \
                         ) \
 { \
-	ctype* restrict beta_cast = beta; \
-	ctype* restrict c_cast    = c; \
-	ctype* restrict p_cast    = p; \
-	ctype* restrict zero      = PASTEMAC(ch,0); \
+	ctype* restrict kappa_cast = kappa; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict p_cast     = p; \
+	ctype* restrict zero       = PASTEMAC(ch,0); \
 \
 	/* We begin by packing the region indicated by the parameters. If
 	   matrix c is dense (either because the structure is general or
 	   because the structure has already been "densified"), this ends
-	   up being the only action we take. Note that if beta is unit,
+	   up being the only action we take. Note that if kappa is unit,
 	   the data is simply copied (rather than scaled by one). */ \
 	PASTEMAC3(ch,ch,ch,scal2m)( diagoffc, \
 	                            diagc, \
@@ -145,7 +150,7 @@ void PASTEMAC(ch,varname)( \
 	                            transc, \
 	                            m, \
 	                            n, \
-	                            beta_cast, \
+	                            kappa_cast, \
 	                            c_cast, rs_c, cs_c, \
 	                            p_cast, rs_p, cs_p ); \
 \
@@ -184,7 +189,7 @@ void PASTEMAC(ch,varname)( \
 			                            transc, \
 			                            m, \
 			                            n, \
-			                            beta_cast, \
+			                            kappa_cast, \
 			                            c_cast, rs_c, cs_c, \
 			                            p_cast, rs_p, cs_p ); \
 		} \
--- a/frame/1m/packm/bli_packm_unb_var1.h
+++ b/frame/1m/packm/bli_packm_unb_var1.h
@@ -32,8 +32,7 @@

 */

-void bli_packm_unb_var1( obj_t*   beta,
-                         obj_t*   c,
+void bli_packm_unb_var1( obj_t*   c,
                         obj_t*   p );


@@ -51,7 +50,7 @@ void PASTEMAC(ch,varname)( \
                           dim_t   n, \
                           dim_t   m_max, \
                           dim_t   n_max, \
-                           void*   beta, \
+                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p \
                         );
--- a/frame/1m/packm/old/bli_packm_blk_var1.c
+++ b/frame/1m/packm/old/bli_packm_blk_var1.c
@@ -83,7 +83,7 @@ void bli_packm_blk_var1( obj_t*   beta,
 	inc_t     cs_p      = bli_obj_col_stride( *p );
 	inc_t     ps_p      = bli_obj_panel_stride( *p );

-	void*     buf_beta  = bli_obj_scalar_buffer( dt_cp, *beta );
+	void*     buf_beta  = bli_obj_buffer_for_1x1( dt_cp, *beta );

 	FUNCPTR_T f;

--- a/frame/1m/packm/other/bli_packm_blk_var2.c
+++ b/frame/1m/packm/other/bli_packm_blk_var2.c
@@ -83,7 +83,7 @@ void bli_packm_blk_var2( obj_t*   beta,
 	dim_t     pd_p      = bli_obj_panel_dim( *p );
 	inc_t     ps_p      = bli_obj_panel_stride( *p );

-	void*     buf_beta  = bli_obj_scalar_buffer( dt_cp, *beta );
+	void*     buf_beta  = bli_obj_buffer_for_1x1( dt_cp, *beta );

 	FUNCPTR_T f;

--- a/frame/1m/scal2m/bli_scal2m.c
+++ b/frame/1m/scal2m/bli_scal2m.c
@@ -53,7 +53,7 @@ void bli_scal2m( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of beta.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
--- a/frame/1m/scalm/bli_scalm.c
+++ b/frame/1m/scalm/bli_scalm.c
@@ -43,28 +43,12 @@ extern scalm_t* scalm_cntl;
 void bli_scalm( obj_t* beta,
                obj_t* x )
 {
-	num_t dt_x;
-	obj_t beta_local;
-
 	if ( bli_error_checking_is_enabled() )
 		bli_scalm_check( beta, x );

-	// Use the datatype of x as the target type for beta (since we do
-	// not assume mixed domain/type support is enabled).
-	dt_x = bli_obj_datatype( *x );
-
-	// Create an object to hold a copy-cast of beta.
-	bli_obj_init_scalar_copy_of( dt_x,
-	                             BLIS_NO_CONJUGATE,
-	                             beta,
-	                             &beta_local );
-
-	bli_scalm_unb_var1( &beta_local, x );
-/*
-	bli_scalm_int( &beta_local,
+	bli_scalm_int( beta,
 	               x,
 	               scalm_cntl );
-*/
 }


--- a/frame/1m/scalm/bli_scalm_int.c
+++ b/frame/1m/scalm/bli_scalm_int.c
@@ -36,8 +36,7 @@

 #define FUNCPTR_T scalm_fp

-typedef void (*FUNCPTR_T)( obj_t* beta,
-                           obj_t* x );
+typedef void (*FUNCPTR_T)( obj_t* x );

 static FUNCPTR_T vars[1][3] =
 {
@@ -49,6 +48,7 @@ void bli_scalm_int( obj_t*   beta,
                    obj_t*   x,
                    scalm_t* cntl )
 {
+	obj_t     x_local;
 	varnum_t  n;
 	impl_t    i;
 	FUNCPTR_T f;
@@ -63,8 +63,18 @@ void bli_scalm_int( obj_t*   beta,
 	// Return early if one of the matrix operands has a zero dimension.
 	if ( bli_obj_has_zero_dim( *x ) ) return;

-	// Return early if the beta scalar equals one.
-	if ( bli_obj_scalar_equals( beta, &BLIS_ONE ) ) return;
+	// Return early if both beta and the scalar attached to x are unit.
+	if ( bli_obj_equals( beta, &BLIS_ONE ) &&
+	     bli_obj_scalar_equals( x, &BLIS_ONE ) ) return;
+
+	// Alias x to x_local so we can apply beta if it is non-unit.
+	bli_obj_alias_to( *x, x_local );
+
+	// If beta is non-unit, apply it to the scalar attached to x.
+	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
+	{
+		bli_obj_scalar_apply_scalar( beta, &x_local );
+	}

 	// Extract the variant number and implementation type.
 	n = cntl_var_num( cntl );
@@ -74,7 +84,6 @@ void bli_scalm_int( obj_t*   beta,
 	f = vars[n][i];

 	// Invoke the variant.
-	f( beta,
-	   x );
+	f( &x_local );
 }

--- a/frame/1m/scalm/bli_scalm_unb_var1.c
+++ b/frame/1m/scalm/bli_scalm_unb_var1.c
@@ -59,12 +59,10 @@ static FUNCPTR_T GENARRAY2_MIN(ftypes,scalm_unb_var1);
 #endif


-void bli_scalm_unb_var1( obj_t*  beta,
-                         obj_t*  x )
+void bli_scalm_unb_var1( obj_t*  x )
 {
 	num_t     dt_x      = bli_obj_datatype( *x );

-	conj_t    conjbeta  = bli_obj_conj_status( *beta );
 	doff_t    diagoffx  = bli_obj_diag_offset( *x );
 	uplo_t    uplox     = bli_obj_uplo( *x );

@@ -76,21 +74,25 @@ void bli_scalm_unb_var1( obj_t*  beta,
 	inc_t     cs_x      = bli_obj_col_stride( *x );

 	void*     buf_beta;
-	num_t     dt_beta;

 	FUNCPTR_T f;

-	// If beta is a scalar constant, use dt_x to extract the address of the
-	// corresponding constant value; otherwise, use the datatype encoded
-	// within the beta object and extract the buffer at the beta offset.
-	bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta );
+
+	// Grab the address of the internal scalar buffer for the scalar
+	// attached to x.
+	buf_beta  = bli_obj_internal_scalar_buffer( *x );

 	// Index into the type combination array to extract the correct
 	// function pointer.
-	f = ftypes[dt_beta][dt_x];
+	// NOTE: We use dt_x for both beta and x because beta was obtained
+	// from the attached scalar of x, which is guaranteed to be of the
+	// same datatype as x.
+	f = ftypes[dt_x][dt_x];

 	// Invoke the function.
-	f( conjbeta,
+	// NOTE: We pass BLIS_NO_CONJUGATE since beta is the scalar attached to x;
+	// conjugation is presumably resolved when applied in bli_scalm_int().
+	f( BLIS_NO_CONJUGATE,
 	   diagoffx,
 	   uplox,
 	   m,
--- a/frame/1m/scalm/bli_scalm_unb_var1.h
+++ b/frame/1m/scalm/bli_scalm_unb_var1.h
@@ -32,8 +32,7 @@

 */

-void bli_scalm_unb_var1( obj_t*  beta,
-                         obj_t*  x );
+void bli_scalm_unb_var1( obj_t*  x );


 #undef  GENTPROT2
--- a/frame/1m/setm/bli_setm.c
+++ b/frame/1m/setm/bli_setm.c
@@ -52,7 +52,7 @@ void bli_setm( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of beta.
-	bli_obj_init_scalar_copy_of( dt_x,
+	bli_obj_scalar_init_detached_copy_of( dt_x,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
--- a/frame/2/gemv/bli_gemv.c
+++ b/frame/2/gemv/bli_gemv.c
@@ -78,7 +78,7 @@ void bli_gemv( obj_t*  alpha,
 	// the type union of the target datatypes of a and x to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -89,7 +89,7 @@ void bli_gemv( obj_t*  alpha,
 	// the complex part of beta*y will not be stored. If y is complex and
 	// beta is real then beta is harmlessly promoted to complex.
 	dt_beta = dt_targ_y;
-	bli_obj_init_scalar_copy_of( dt_beta,
+	bli_obj_scalar_init_detached_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
@@ -188,8 +188,8 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m_x * incx; \
 	rs_y = incy; cs_y = m_y * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
-	bli_obj_create_scalar_with_attached_buffer( dt, beta,  &betao  ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m_x, 1,   x, rs_x, cs_x, &xo ); \
--- a/frame/2/gemv/bli_gemv_blk_var1.c
+++ b/frame/2/gemv/bli_gemv_blk_var1.c
@@ -76,8 +76,7 @@ void bli_gemv_blk_var1( obj_t*  alpha,
 		                cntl_sub_packv_y( cntl ) );

 		// Copy/pack A1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a1,
+		bli_packm_int( &a1,
 		               &a1_pack,
 		               cntl_sub_packm_a( cntl ) );
 		bli_packv_int( &y1,
--- a/frame/2/gemv/bli_gemv_blk_var2.c
+++ b/frame/2/gemv/bli_gemv_blk_var2.c
@@ -81,8 +81,7 @@ void bli_gemv_blk_var2( obj_t*  alpha,
 		                cntl_sub_packv_x( cntl ) );

 		// Copy/pack A1, x1 (if needed).
-		bli_packm_int( alpha,
-		               &a1,
+		bli_packm_int( &a1,
 		               &a1_pack,
 		               cntl_sub_packm_a( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/gemv/bli_gemv_unb_var1.c
+++ b/frame/2/gemv/bli_gemv_unb_var1.c
@@ -99,11 +99,11 @@ void bli_gemv_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/gemv/bli_gemv_unb_var2.c
+++ b/frame/2/gemv/bli_gemv_unb_var2.c
@@ -99,11 +99,11 @@ void bli_gemv_unb_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -99,11 +99,11 @@ void bli_gemv_unf_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -99,11 +99,11 @@ void bli_gemv_unf_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/ger/bli_ger.c
+++ b/frame/2/ger/bli_ger.c
@@ -75,7 +75,7 @@ void bli_ger( obj_t*  alpha,
 	// the type union of the target datatypes of x and y to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_x, dt_targ_y );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -148,7 +148,7 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m_x * incx; \
 	rs_y = incy; cs_y = m_y * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m_x, 1, x, rs_x, cs_x, &xo ); \
 	bli_obj_create_with_attached_buffer( dt, m_y, 1, y, rs_y, cs_y, &yo ); \
--- a/frame/2/ger/bli_ger_blk_var1.c
+++ b/frame/2/ger/bli_ger_blk_var1.c
@@ -75,8 +75,7 @@ void bli_ger_blk_var1( obj_t* alpha,
 		                cntl_sub_packv_x( cntl ) );

 		// Copy/pack A1, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a1,
+		bli_packm_int( &a1,
 		               &a1_pack,
 		               cntl_sub_packm_a( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/ger/bli_ger_blk_var2.c
+++ b/frame/2/ger/bli_ger_blk_var2.c
@@ -75,8 +75,7 @@ void bli_ger_blk_var2( obj_t* alpha,
 		                cntl_sub_packv_y( cntl ) );

 		// Copy/pack A1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a1,
+		bli_packm_int( &a1,
 		               &a1_pack,
 		               cntl_sub_packm_a( cntl ) );
 		bli_packv_int( &y1,
--- a/frame/2/ger/bli_ger_int.c
+++ b/frame/2/ger/bli_ger_int.c
@@ -95,7 +95,7 @@ void bli_ger_int( conj_t  conjx,
 		bli_obj_toggle_conj( x_local );
 		bli_obj_toggle_conj( y_local );

-		bli_obj_init_scalar_copy_of( bli_obj_datatype( *alpha ),
+		bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *alpha ),
 		                             BLIS_CONJUGATE,
 		                             alpha,
 		                             &alpha_local );
--- a/frame/2/ger/bli_ger_unb_var1.c
+++ b/frame/2/ger/bli_ger_unb_var1.c
@@ -94,7 +94,7 @@ void bli_ger_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of x and y. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/ger/bli_ger_unb_var2.c
+++ b/frame/2/ger/bli_ger_unb_var2.c
@@ -94,7 +94,7 @@ void bli_ger_unb_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of x and y. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv.c
+++ b/frame/2/hemv/bli_hemv.c
@@ -78,7 +78,7 @@ void bli_hemv( obj_t*  alpha,
 	// the type union of the target datatypes of a and x to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -89,7 +89,7 @@ void bli_hemv( obj_t*  alpha,
 	// the complex part of beta*y will not be stored. If y is complex and
 	// beta is real then beta is harmlessly promoted to complex.
 	dt_beta = dt_targ_y;
-	bli_obj_init_scalar_copy_of( dt_beta,
+	bli_obj_scalar_init_detached_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
@@ -180,8 +180,8 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m * incx; \
 	rs_y = incy; cs_y = m * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
-	bli_obj_create_scalar_with_attached_buffer( dt, beta,  &betao  ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
--- a/frame/2/hemv/bli_hemv_blk_var1.c
+++ b/frame/2/hemv/bli_hemv_blk_var1.c
@@ -106,8 +106,7 @@ void bli_hemv_blk_var1( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack A11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/hemv/bli_hemv_blk_var2.c
+++ b/frame/2/hemv/bli_hemv_blk_var2.c
@@ -109,8 +109,7 @@ void bli_hemv_blk_var2( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack A11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/hemv/bli_hemv_blk_var3.c
+++ b/frame/2/hemv/bli_hemv_blk_var3.c
@@ -106,8 +106,7 @@ void bli_hemv_blk_var3( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack A11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/hemv/bli_hemv_blk_var4.c
+++ b/frame/2/hemv/bli_hemv_blk_var4.c
@@ -109,8 +109,7 @@ void bli_hemv_blk_var4( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack A11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/hemv/bli_hemv_unb_var1.c
+++ b/frame/2/hemv/bli_hemv_unb_var1.c
@@ -101,11 +101,11 @@ void bli_hemv_unb_var1( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unb_var2.c
+++ b/frame/2/hemv/bli_hemv_unb_var2.c
@@ -101,11 +101,11 @@ void bli_hemv_unb_var2( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unb_var3.c
+++ b/frame/2/hemv/bli_hemv_unb_var3.c
@@ -101,11 +101,11 @@ void bli_hemv_unb_var3( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unb_var4.c
+++ b/frame/2/hemv/bli_hemv_unb_var4.c
@@ -101,11 +101,11 @@ void bli_hemv_unb_var4( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -101,11 +101,11 @@ void bli_hemv_unf_var1( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unf_var1a.c
+++ b/frame/2/hemv/bli_hemv_unf_var1a.c
@@ -101,11 +101,11 @@ void bli_hemv_unf_var1a( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -101,11 +101,11 @@ void bli_hemv_unf_var3( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 #if 0
 	obj_t x_copy, y_copy;
--- a/frame/2/hemv/bli_hemv_unf_var3a.c
+++ b/frame/2/hemv/bli_hemv_unf_var3a.c
@@ -101,11 +101,11 @@ void bli_hemv_unf_var3a( conj_t  conjh,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// The datatype of beta MUST be the same as the datatype of y.
 	dt_beta   = dt_y;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
+	buf_beta  = bli_obj_buffer_for_1x1( dt_beta, *beta );

 #if 0
 	obj_t x_copy, y_copy;
--- a/frame/2/her/bli_her.c
+++ b/frame/2/her/bli_her.c
@@ -68,7 +68,7 @@ void bli_her( obj_t*  alpha,

 	// Create object to hold a copy-cast of alpha.
 	dt_alpha = dt_targ_x;
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -151,7 +151,7 @@ void PASTEMAC(ch,opname)( \
 \
 	rs_x = incx; cs_x = m * incx; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt_r, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
 	bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
--- a/frame/2/her/bli_her_blk_var1.c
+++ b/frame/2/her/bli_her_blk_var1.c
@@ -90,8 +90,7 @@ void bli_her_blk_var1( conj_t  conjh,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack C11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her/bli_her_blk_var2.c
+++ b/frame/2/her/bli_her_blk_var2.c
@@ -90,8 +90,7 @@ void bli_her_blk_var2( conj_t  conjh,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack C11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her2/bli_her2.c
+++ b/frame/2/her2/bli_her2.c
@@ -75,13 +75,13 @@ void bli_her2( obj_t*  alpha,
 	// Create an object to hold a copy-cast of alpha. Notice that we use
 	// the type union of the datatypes of x and y.
 	dt_alpha = bli_datatype_union( dt_targ_x, dt_targ_y );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );

 	// Also create a conjugated copy of alpha.
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_CONJUGATE,
 	                             alpha,
 	                             &alpha_conj_local );
@@ -171,7 +171,7 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m * incx; \
 	rs_y = incy; cs_y = m * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \
--- a/frame/2/her2/bli_her2_blk_var1.c
+++ b/frame/2/her2/bli_her2_blk_var1.c
@@ -101,8 +101,7 @@ void bli_her2_blk_var1( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack C11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her2/bli_her2_blk_var2.c
+++ b/frame/2/her2/bli_her2_blk_var2.c
@@ -104,8 +104,7 @@ void bli_her2_blk_var2( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack C11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her2/bli_her2_blk_var3.c
+++ b/frame/2/her2/bli_her2_blk_var3.c
@@ -104,8 +104,7 @@ void bli_her2_blk_var3( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack C11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her2/bli_her2_blk_var4.c
+++ b/frame/2/her2/bli_her2_blk_var4.c
@@ -101,8 +101,7 @@ void bli_her2_blk_var4( conj_t  conjh,
 		                cntl_sub_packv_y1( cntl ) );

 		// Copy/pack C11, x1, y1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &c11,
+		bli_packm_int( &c11,
 		               &c11_pack,
 		               cntl_sub_packm_c11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/her2/bli_her2_int.c
+++ b/frame/2/her2/bli_her2_int.c
@@ -93,11 +93,11 @@ void bli_her2_int( conj_t   conjh,
 		bli_obj_toggle_conj( x_local );
 		bli_obj_toggle_conj( y_local );

-		bli_obj_init_scalar_copy_of( bli_obj_datatype( *alpha ),
+		bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *alpha ),
 		                             BLIS_CONJUGATE,
 		                             alpha,
 		                             &alpha_local );
-		bli_obj_init_scalar_copy_of( bli_obj_datatype( *alpha_conj ),
+		bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *alpha_conj ),
 		                             BLIS_CONJUGATE,
 		                             alpha_conj,
 		                             &alpha_conj_local );
--- a/frame/2/her2/bli_her2_unb_var1.c
+++ b/frame/2/her2/bli_her2_unb_var1.c
@@ -96,7 +96,7 @@ void bli_her2_unb_var1( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/her2/bli_her2_unb_var2.c
+++ b/frame/2/her2/bli_her2_unb_var2.c
@@ -96,7 +96,7 @@ void bli_her2_unb_var2( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/her2/bli_her2_unb_var3.c
+++ b/frame/2/her2/bli_her2_unb_var3.c
@@ -96,7 +96,7 @@ void bli_her2_unb_var3( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/her2/bli_her2_unb_var4.c
+++ b/frame/2/her2/bli_her2_unb_var4.c
@@ -96,7 +96,7 @@ void bli_her2_unb_var4( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/her2/bli_her2_unf_var1.c
+++ b/frame/2/her2/bli_her2_unf_var1.c
@@ -96,7 +96,7 @@ void bli_her2_unf_var1( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/her2/bli_her2_unf_var4.c
+++ b/frame/2/her2/bli_her2_unf_var4.c
@@ -96,7 +96,7 @@ void bli_her2_unf_var4( conj_t   conjh,

 	// The datatype of alpha MUST be the type union of the datatypes of x and y.
 	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/symv/bli_symv.c
+++ b/frame/2/symv/bli_symv.c
@@ -78,7 +78,7 @@ void bli_symv( obj_t*  alpha,
 	// the type union of the target datatypes of a and x to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -89,7 +89,7 @@ void bli_symv( obj_t*  alpha,
 	// the complex part of beta*y will not be stored. If y is complex and
 	// beta is real then beta is harmlessly promoted to complex.
 	dt_beta = dt_targ_y;
-	bli_obj_init_scalar_copy_of( dt_beta,
+	bli_obj_scalar_init_detached_copy_of( dt_beta,
 	                             BLIS_NO_CONJUGATE,
 	                             beta,
 	                             &beta_local );
@@ -180,8 +180,8 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m * incx; \
 	rs_y = incy; cs_y = m * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
-	bli_obj_create_scalar_with_attached_buffer( dt, beta,  &betao  ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
--- a/frame/2/syr/bli_syr.c
+++ b/frame/2/syr/bli_syr.c
@@ -70,7 +70,7 @@ void bli_syr( obj_t*  alpha,
 	// the type union of the target datatypes of x and c to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_x, dt_targ_c );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -152,7 +152,7 @@ void PASTEMAC(ch,opname)( \
 \
 	rs_x = incx; cs_x = m * incx; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
 	bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
--- a/frame/2/syr2/bli_syr2.c
+++ b/frame/2/syr2/bli_syr2.c
@@ -74,7 +74,7 @@ void bli_syr2( obj_t*  alpha,
 	// Create an object to hold a copy-cast of alpha. Notice that we use
 	// the type union of the datatypes of x and y.
 	dt_alpha = bli_datatype_union( dt_targ_x, dt_targ_y );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -164,7 +164,7 @@ void PASTEMAC(ch,opname)( \
 	rs_x = incx; cs_x = m * incx; \
 	rs_y = incy; cs_y = m * incy; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \
--- a/frame/2/trmv/bli_trmv.c
+++ b/frame/2/trmv/bli_trmv.c
@@ -70,7 +70,7 @@ void bli_trmv( obj_t*  alpha,
 	// the type union of the target datatypes of a and x to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -150,7 +150,7 @@ void PASTEMAC(ch,opname)( \
 \
 	rs_x = incx; cs_x = m * incx; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
--- a/frame/2/trmv/bli_trmv_l_blk_var1.c
+++ b/frame/2/trmv/bli_trmv_l_blk_var1.c
@@ -80,8 +80,7 @@ void bli_trmv_l_blk_var1( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trmv/bli_trmv_l_blk_var2.c
+++ b/frame/2/trmv/bli_trmv_l_blk_var2.c
@@ -80,8 +80,7 @@ void bli_trmv_l_blk_var2( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trmv/bli_trmv_u_blk_var1.c
+++ b/frame/2/trmv/bli_trmv_u_blk_var1.c
@@ -80,8 +80,7 @@ void bli_trmv_u_blk_var1( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trmv/bli_trmv_u_blk_var2.c
+++ b/frame/2/trmv/bli_trmv_u_blk_var2.c
@@ -80,8 +80,7 @@ void bli_trmv_u_blk_var2( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trmv/bli_trmv_unb_var1.c
+++ b/frame/2/trmv/bli_trmv_unb_var1.c
@@ -88,7 +88,7 @@ void bli_trmv_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trmv/bli_trmv_unb_var2.c
+++ b/frame/2/trmv/bli_trmv_unb_var2.c
@@ -88,7 +88,7 @@ void bli_trmv_unb_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -88,7 +88,7 @@ void bli_trmv_unf_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -88,7 +88,7 @@ void bli_trmv_unf_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trsv/bli_trsv.c
+++ b/frame/2/trsv/bli_trsv.c
@@ -70,7 +70,7 @@ void bli_trsv( obj_t*  alpha,
 	// the type union of the target datatypes of a and x to prevent any
 	// unnecessary loss of information during the computation.
 	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
-	bli_obj_init_scalar_copy_of( dt_alpha,
+	bli_obj_scalar_init_detached_copy_of( dt_alpha,
 	                             BLIS_NO_CONJUGATE,
 	                             alpha,
 	                             &alpha_local );
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname)( \
 \
 	rs_x = incx; cs_x = m * incx; \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
--- a/frame/2/trsv/bli_trsv_l_blk_var1.c
+++ b/frame/2/trsv/bli_trsv_l_blk_var1.c
@@ -85,8 +85,7 @@ void bli_trsv_l_blk_var1( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trsv/bli_trsv_l_blk_var2.c
+++ b/frame/2/trsv/bli_trsv_l_blk_var2.c
@@ -85,8 +85,7 @@ void bli_trsv_l_blk_var2( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trsv/bli_trsv_u_blk_var1.c
+++ b/frame/2/trsv/bli_trsv_u_blk_var1.c
@@ -85,8 +85,7 @@ void bli_trsv_u_blk_var1( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trsv/bli_trsv_u_blk_var2.c
+++ b/frame/2/trsv/bli_trsv_u_blk_var2.c
@@ -85,8 +85,7 @@ void bli_trsv_u_blk_var2( obj_t*  alpha,
 		                cntl_sub_packv_x1( cntl ) );

 		// Copy/pack A11, x1 (if needed).
-		bli_packm_int( &BLIS_ONE,
-		               &a11,
+		bli_packm_int( &a11,
 		               &a11_pack,
 		               cntl_sub_packm_a11( cntl ) );
 		bli_packv_int( &x1,
--- a/frame/2/trsv/bli_trsv_unb_var1.c
+++ b/frame/2/trsv/bli_trsv_unb_var1.c
@@ -88,7 +88,7 @@ void bli_trsv_unb_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trsv/bli_trsv_unb_var2.c
+++ b/frame/2/trsv/bli_trsv_unb_var2.c
@@ -88,7 +88,7 @@ void bli_trsv_unb_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -88,7 +88,7 @@ void bli_trsv_unf_var1( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -88,7 +88,7 @@ void bli_trsv_unf_var2( obj_t*  alpha,
 	// The datatype of alpha MUST be the type union of a and x. This is to
 	// prevent any unnecessary loss of information during computation.
 	dt_alpha  = bli_datatype_union( dt_a, dt_x );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
+	buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );

 	// Index into the type combination array to extract the correct
 	// function pointer.
--- a/frame/3/gemm/bli_gemm.c
+++ b/frame/3/gemm/bli_gemm.c
@@ -48,21 +48,16 @@ void bli_gemm( obj_t*  alpha,
               obj_t*  c )
 {
 	gemm_t* cntl;
-	obj_t   alpha_local;
-	obj_t   beta_local;
 	obj_t   a_local;
 	obj_t   b_local;
 	obj_t   c_local;
-	num_t   dt_alpha;
-	num_t   dt_beta;
-	bool_t  pack_c;

 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_gemm_check( alpha, a, b, beta, c );

 	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_scalar_equals( alpha, &BLIS_ZERO ) )
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
 	{
 		bli_scalm( beta, c );
 		return;
@@ -86,29 +81,6 @@ void bli_gemm( obj_t*  alpha,
 		bli_obj_induce_trans( c_local );
 	}

-	// Set the target and execution datatypes of the objects, and apply
-	// any transformations necessary to handle mixed domain computation.
-	bli_gemm_set_targ_exec_datatypes( &a_local,
-	                                  &b_local,
-	                                  &c_local,
-	                                  &dt_alpha,
-	                                  &dt_beta,
-	                                  &pack_c );
-
-	// Create an object to hold a copy-cast of alpha.
-	bli_obj_init_scalar_copy_of( dt_alpha,
-	                             BLIS_NO_CONJUGATE,
-	                             alpha,
-	                             &alpha_local );
-
-	// Create an object to hold a copy-cast of beta.
-	bli_obj_init_scalar_copy_of( dt_beta,
-	                             BLIS_NO_CONJUGATE,
-	                             beta,
-	                             &beta_local );
-
-	if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-
 	// Choose the control tree.
 	cntl = gemm_cntl;

@@ -122,10 +94,10 @@ void bli_gemm( obj_t*  alpha,
 #endif

 	// Invoke the internal back-end.
-	bli_gemm_int( &alpha_local,
+	bli_gemm_int( alpha,
 	              &a_local,
 	              &b_local,
-	              &beta_local,
+	              beta,
 	              &c_local,
 	              cntl );
 }
@@ -159,8 +131,8 @@ void PASTEMAC(ch,opname)( \
 	bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
 	bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \
 \
-	bli_obj_create_scalar_with_attached_buffer( dt, alpha, &alphao ); \
-	bli_obj_create_scalar_with_attached_buffer( dt, beta,  &betao  ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
 \
 	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
 	bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
--- a/frame/3/gemm/bli_gemm_blk_var1.c
+++ b/frame/3/gemm/bli_gemm_blk_var1.c
@@ -34,10 +34,8 @@

 #include "blis.h"

-void bli_gemm_blk_var1( obj_t*  alpha,
-                        obj_t*  a,
+void bli_gemm_blk_var1( obj_t*  a,
                        obj_t*  b,
-                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
 {
@@ -58,7 +56,7 @@ void bli_gemm_blk_var1( obj_t*  alpha,
 	m_trans = bli_obj_length_after_trans( *a );

 	// Scale C by beta (if instructed).
-	bli_scalm_int( beta,
+	bli_scalm_int( &BLIS_ONE,
 	               c,
 	               cntl_sub_scalm( cntl ) );

@@ -66,9 +64,8 @@ void bli_gemm_blk_var1( obj_t*  alpha,
 	bli_packm_init( b, &b_pack,
 	                cntl_sub_packm_b( cntl ) );

-	// Pack B and scale by alpha (if instructed).
-	bli_packm_int( alpha,
-	               b, &b_pack,
+	// Pack B (if instructed).
+	bli_packm_int( b, &b_pack,
 	               cntl_sub_packm_b( cntl ) );

 	// Partition along the m dimension.
@@ -93,21 +90,19 @@ void bli_gemm_blk_var1( obj_t*  alpha,
 		bli_packm_init( &c1, &c1_pack,
 		                cntl_sub_packm_c( cntl ) );

-		// Pack A1 and scale by alpha (if instructed).
-		bli_packm_int( alpha,
-		               &a1, &a1_pack,
+		// Pack A1 (if instructed).
+		bli_packm_int( &a1, &a1_pack,
 		               cntl_sub_packm_a( cntl ) );

-		// Pack C1 and scale by beta (if instructed).
-		bli_packm_int( beta,
-		               &c1, &c1_pack,
+		// Pack C1 (if instructed).
+		bli_packm_int( &c1, &c1_pack,
 		               cntl_sub_packm_c( cntl ) );

 		// Perform gemm subproblem.
-		bli_gemm_int( alpha,
+		bli_gemm_int( &BLIS_ONE,
 		              &a1_pack,
 		              &b_pack,
-		              beta,
+		              &BLIS_ONE,
 		              &c1_pack,
 		              cntl_sub_gemm( cntl ) );

--- a/frame/3/gemm/bli_gemm_blk_var1.h
+++ b/frame/3/gemm/bli_gemm_blk_var1.h
@@ -32,10 +32,8 @@

 */

-void bli_gemm_blk_var1( obj_t*  alpha,
-                        obj_t*  a,
+void bli_gemm_blk_var1( obj_t*  a,
                        obj_t*  b,
-                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl );

--- a/Show More
+++ b/Show More