From 08475e7c7653ba598665071a617d10f0d8f763c2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 11 Jun 2013 12:18:39 -0500 Subject: [PATCH] Various level-3 optimizations for row storage. Details: - Implemented remaining two cases within bli_packm_blk_var2(), which allow packing from a lower or upper-stored symmetric/Hermitian matrix to column panels (which are row-stored). Previously one could only pack to row panels (which are column-stored). - Implemented various optimizations in the level-3 front-ends that allow more favorable access through row-stored matrices for gemm, hemm, herk, her2k, symm, syrk, and syr2k. - Cleaned up code in level-3 front-ends that has to do with setting target and execution datatypes. --- frame/1m/packm/bli_packm_blk_var2.c | 118 ++++++++++++--- frame/1m/packm/bli_packm_blk_var3.c | 2 +- frame/3/gemm/bli_gemm.c | 120 +++++---------- frame/3/gemm/bli_gemm_check.c | 6 +- frame/3/gemm/bli_gemm_target.c | 93 ++++++++++++ frame/3/gemm/bli_gemm_target.h | 7 + frame/3/hemm/bli_hemm.c | 55 +++---- frame/3/hemm/bli_hemm_cntl.c | 3 +- frame/3/her2k/bli_her2k.c | 104 ++++++------- frame/3/her2k/bli_her2k.h | 1 + frame/3/her2k/bli_her2k_target.c | 99 +++++++++++++ frame/3/her2k/bli_her2k_target.h | 43 ++++++ frame/3/herk/bli_herk.c | 95 +++++------- frame/3/herk/bli_herk.h | 1 + frame/3/herk/bli_herk_target.c | 203 ++++++++++++++++++++++++++ frame/3/herk/bli_herk_target.h | 66 +++++++++ frame/3/symm/bli_symm.c | 54 +++---- frame/3/syr2k/bli_syr2k.c | 97 +++++------- frame/3/syrk/bli_syrk.c | 93 ++++-------- frame/include/bli_obj_macro_defs.h | 11 ++ frame/include/bli_scalar_macro_defs.h | 8 +- 21 files changed, 871 insertions(+), 408 deletions(-) create mode 100644 frame/3/her2k/bli_her2k_target.c create mode 100644 frame/3/her2k/bli_her2k_target.h create mode 100644 frame/3/herk/bli_herk_target.c create mode 100644 frame/3/herk/bli_herk_target.h diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index ad4e67e09..d0ab7f70e 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -143,6 +143,7 @@ void PASTEMAC(ch,varname )( \ dim_t panel_len; \ doff_t diagoffc_i; \ doff_t diagoffc_inc; \ + doff_t diagoffc_i_abs; \ dim_t panel_dim_i; \ inc_t vs_c; \ inc_t incc, ldc; \ @@ -171,6 +172,7 @@ void PASTEMAC(ch,varname )( \ ctype* restrict p11; \ dim_t p11_m; \ dim_t p11_n; \ + inc_t rs_p11, cs_p11; \ \ \ /* Extract the conjugation bit from the transposition argument. */ \ @@ -191,7 +193,7 @@ void PASTEMAC(ch,varname )( \ we are packing to row panels. */ \ if ( bli_is_row_stored( rs_p, cs_p ) ) \ { \ - /* Prepare to pack to column panels. */ \ + /* Prepare to pack to row-stored column panels. */ \ iter_dim = n; \ panel_len = m; \ panel_dim = pd_p; \ @@ -204,10 +206,12 @@ void PASTEMAC(ch,varname )( \ n_panel = &panel_dim_i; \ m_panel_max = m_max; \ n_panel_max = panel_dim; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ } \ else /* if ( bli_is_col_stored( rs_p, cs_p ) ) */ \ { \ - /* Prepare to pack to row panels. */ \ + /* Prepare to pack to column-stored row panels. */ \ iter_dim = m; \ panel_len = n; \ panel_dim = pd_p; \ @@ -220,6 +224,8 @@ void PASTEMAC(ch,varname )( \ n_panel = &n; \ m_panel_max = panel_dim; \ n_panel_max = n_max; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ } \ \ /* Compute the total number of iterations we'll need. */ \ @@ -237,11 +243,11 @@ void PASTEMAC(ch,varname )( \ for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ - panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \ + panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \ \ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*ps_p; \ + diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ + c_begin = c_cast + (ic )*vs_c; \ + p_begin = p_cast + (ip )*ps_p; \ \ /* If the current panel intersects the diagonal and C is either upper- or lower-stored, then we assume C is symmetric or @@ -252,17 +258,76 @@ void PASTEMAC(ch,varname )( \ if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \ bli_is_upper_or_lower( uploc ) ) \ { \ - /* Only two of four cases implemented, since BLIS2 currently does - not support triangular packing of matrix B (which is row-stored). */ \ - /*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ + diagoffc_i_abs = bli_abs( diagoffc_i ); \ +\ + /*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) */ \ + if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ + p10_dim = panel_dim_i; \ + p10_len = diagoffc_i_abs; \ + p10 = p_begin; \ + c10 = c_begin; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim_i; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_i_abs - j; \ + p12 = p_begin + (j )*ldp; \ + c12 = c_begin + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + p11_m = panel_dim_i; \ + p11_n = panel_dim_i; \ + j = diagoffc_i_abs; \ + p11 = p_begin + (j )*ldp; \ + c11 = c_begin + (j )*ldc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ } \ - else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \ + /*else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \ + else /* if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ + p10_dim = panel_dim_i; \ + p10_len = diagoffc_i_abs + panel_dim_i; \ + diagoffc10 = diagoffc_i; \ + p10 = p_begin; \ + c10 = c_begin; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim_i; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_begin + (j )*ldp; \ + c12 = c_begin + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + p11_m = panel_dim_i; \ + p11_n = panel_dim_i; \ + j = diagoffc_i_abs; \ + p11 = p_begin + (j )*ldp; \ + c11 = c_begin + (j )*ldc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ } \ - else*/ if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ +/* + else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ { \ p10_dim = panel_dim_i; \ p10_len = diagoffc_i + panel_dim_i; \ @@ -279,7 +344,7 @@ void PASTEMAC(ch,varname )( \ p12_len = panel_len - p10_len; \ j = p10_len; \ p12 = p_begin + (j )*ldp; \ - c12 = c_begin + (j )*cs_c; \ + c12 = c_begin + (j )*ldc; \ incc12 = incc; \ ldc12 = ldc; \ conjc12 = conjc; \ @@ -288,12 +353,12 @@ void PASTEMAC(ch,varname )( \ p11_n = panel_dim_i; \ j = diagoffc_i; \ p11 = p_begin + (j )*ldp; \ - c11 = c_begin + (j )*cs_c; \ + c11 = c_begin + (j )*ldc; \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc10 ); \ } \ - else /* if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \ + else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim_i; \ p10_len = diagoffc_i; \ @@ -308,7 +373,7 @@ void PASTEMAC(ch,varname )( \ j = p10_len; \ diagoffc12 = diagoffc_i - j; \ p12 = p_begin + (j )*ldp; \ - c12 = c_begin + (j )*cs_c; \ + c12 = c_begin + (j )*ldc; \ c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ -diagoffc12 * ( doff_t )rs_c; \ incc12 = ldc; \ @@ -319,11 +384,12 @@ void PASTEMAC(ch,varname )( \ p11_n = panel_dim_i; \ j = diagoffc_i; \ p11 = p_begin + (j )*ldp; \ - c11 = c_begin + (j )*cs_c; \ + c11 = c_begin + (j )*ldc; \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc12 ); \ } \ +*/ \ \ /* Pack to P10. For upper storage, this includes the unstored triangle of C11. */ \ @@ -351,11 +417,16 @@ void PASTEMAC(ch,varname )( \ p11_m, \ p11_n, \ beta_cast, \ - c11, rs_c, cs_c, \ - p11, 1, ldp ); \ + c11, rs_c, cs_c, \ + p11, rs_p11, cs_p11 ); \ } \ else \ { \ + /* Note that the following code executes if the current panel either: + - does not intersect the diagonal, or + - does intersect the diagonal, BUT the matrix is general + which means the entire current panel can be copied at once. */ \ +\ /* We use some c10-specific variables here because we might need to change them if the current panel is unstored. (The values below are used if the current panel is stored.) */ \ @@ -368,8 +439,9 @@ void PASTEMAC(ch,varname )( \ adjustments so we refer to the data where it is actually stored, and so we take conjugation into account. (Note this implicitly assumes we are operating on a symmetric or - Hermitian matrix.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len ) ) \ + Hermitian matrix, since a general matrix would not contain + any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \ { \ c10 = c10 + diagoffc_i * ( doff_t )cs_c + \ -diagoffc_i * ( doff_t )rs_c; \ @@ -440,7 +512,7 @@ void PASTEMAC(ch,varname )( \ p_begin, 1, cs_p, "%4.1f", "" ); \ if ( cs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \ - p_begin, panel_dim, 1, "%6.3f", "" ); \ + p_begin, panel_dim, 1, "%8.5f", "" ); \ */ \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index de5c7074b..3cb38a5fc 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -271,7 +271,7 @@ void PASTEMAC(ch,varname )( \ } \ else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) ) \ { \ - /* Only two of four cases implemented, since BLIS2 currently does + /* Only two of four cases implemented, since BLIS currently does not support triangular packing of matrix B. */ \ /*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \ { \ diff --git a/frame/3/gemm/bli_gemm.c b/frame/3/gemm/bli_gemm.c index c9b22b888..00796210f 100644 --- a/frame/3/gemm/bli_gemm.c +++ b/frame/3/gemm/bli_gemm.c @@ -48,13 +48,12 @@ void bli_gemm( obj_t* alpha, gemm_t* cntl; obj_t alpha_local; obj_t beta_local; - num_t dt_targ_a; - num_t dt_targ_b; - num_t dt_targ_c; - num_t dt_exec; + obj_t a_local; + obj_t b_local; + obj_t c_local; num_t dt_alpha; num_t dt_beta; - bool_t pack_c = FALSE; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -67,105 +66,56 @@ void bli_gemm( obj_t* alpha, return; } - // Determine the target datatype of each matrix object. - bli_gemm_get_target_datatypes( a, - b, - c, - &dt_targ_a, - &dt_targ_b, - &dt_targ_c, - &pack_c ); - - // Set the target datatypes for each matrix object. - bli_obj_set_target_datatype( dt_targ_a, *a ); - bli_obj_set_target_datatype( dt_targ_b, *b ); - bli_obj_set_target_datatype( dt_targ_c, *c ); + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); + bli_obj_alias_to( *b, b_local ); + bli_obj_alias_to( *c, c_local ); - // Determine the execution datatype. Generally speaking, the - // execution datatype is the real projection of the target datatype - // of c. This rule holds unless the target datatypes of a and b - // are both complex, in which case the execution datatype is also - // complex. - if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) ) - dt_exec = dt_targ_c; - else - dt_exec = bli_datatype_proj_to_real( dt_targ_c ); - - // Embed the execution datatype in all matrix operands. - bli_obj_set_execution_datatype( dt_exec, *a ); - bli_obj_set_execution_datatype( dt_exec, *b ); - bli_obj_set_execution_datatype( dt_exec, *c ); - - // Note that the precisions of the target datatypes of a, b, and c - // match. The domains, however, are not necessarily the same. There - // are eight possible combinations of target domains: - // - // case input target exec pack notes - // domain domain domain c? - // c+=a*b c+=a*b - // (0) r r r r r r r - // (1) r r c r r r r b demoted to real - // (2) r c r r r r r a demoted to real - // (3) r c c c c c c yes a*b demoted to real - // (4) c r r r r r r yes copynzm used to update c - // (5) c r c c r c r yes transposed to induce (6) - // (6) c c r c c r r ~ c and a treated as real - // (7) c c c c c c c - // ~ Must pack c if not column-stored (ie: row or general storage). - // - // There are two special cases: (5) and (6). Because the inner kernels - // assume column storage, it is easy to implement (6) since we can - // simply treat matrices c and a as real matrices with inflated m - // dimension and column stride and then proceed with a kernel for real - // computation. We cannot pull the same trick with case (5) because it - // would result in a mismatch in the k dimension. But we can transform - // case (5) into case (6) by transposing all arguments and swapping the - // a and b operands. Also, we will need to pack matrix c. That is what - // we do here. - if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) ) + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. (The effect of the transposition of A and B is negligible + // because those operands are always packed to contiguous memory.) + if ( bli_obj_is_row_stored( *c ) ) { - bli_obj_swap_pointers( a, b ); - bli_swap_types( dt_targ_a, dt_targ_b ); - bli_obj_toggle_trans( *c ); - bli_obj_toggle_trans( *a ); - bli_obj_toggle_trans( *b ); + bli_obj_swap( a_local, b_local ); + + bli_obj_induce_trans( a_local ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); } - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix a. By inspecting the table above, - // this clearly works for cases (0) through (4), (6), and (7). It - // Also works for case (5) since it is transformed into case (6) by - // the above code. - dt_alpha = dt_targ_a; + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_gemm_set_targ_exec_datatypes( &a_local, + &b_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); + + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of c. Here's why: If c is real and beta is complex, - // there is no reason to keep beta_local in the complex domain since - // the complex part of beta*c will not be stored. If c is complex and - // beta is real then beta is harmlessly promoted to complex. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); - // Choose the control tree based on whether it was determined we need - // to pack c. - //if ( pack_c ) gemm_cntl = gemm_cntl_packabc; - //else gemm_cntl = gemm_cntl_packab; + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + // Choose the control tree. cntl = gemm_cntl; - if ( pack_c ) bli_abort(); // Invoke the internal back-end. bli_gemm_int( &alpha_local, - a, - b, + &a_local, + &b_local, &beta_local, - c, + &c_local, cntl ); } diff --git a/frame/3/gemm/bli_gemm_check.c b/frame/3/gemm/bli_gemm_check.c index 1d3d5d96f..4c1d49462 100644 --- a/frame/3/gemm/bli_gemm_check.c +++ b/frame/3/gemm/bli_gemm_check.c @@ -75,8 +75,8 @@ void bli_gemm_basic_check( obj_t* alpha, // We don't enforce general structure in matrix A so we can use gemm to // implement hemm/symm. Instead, we only check this from the front-end. - e_val = bli_check_general_object( b ); - bli_check_error_code( e_val ); + //e_val = bli_check_general_object( b ); + //bli_check_error_code( e_val ); e_val = bli_check_general_object( c ); bli_check_error_code( e_val ); @@ -99,6 +99,8 @@ void bli_gemm_check( obj_t* alpha, e_val = bli_check_general_object( a ); bli_check_error_code( e_val ); + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); } void bli_gemm_int_check( obj_t* alpha, diff --git a/frame/3/gemm/bli_gemm_target.c b/frame/3/gemm/bli_gemm_target.c index 0693301b5..1e938f5a0 100644 --- a/frame/3/gemm/bli_gemm_target.c +++ b/frame/3/gemm/bli_gemm_target.c @@ -34,6 +34,99 @@ #include "blis.h" +void bli_gemm_set_targ_exec_datatypes( obj_t* a, + obj_t* b, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ) +{ + num_t dt_targ_a; + num_t dt_targ_b; + num_t dt_targ_c; + num_t dt_exec; + + // Determine the target datatype of each matrix object. + bli_gemm_get_target_datatypes( a, + b, + c, + &dt_targ_a, + &dt_targ_b, + &dt_targ_c, + pack_c ); + + // Set the target datatypes for each matrix object. + bli_obj_set_target_datatype( dt_targ_a, *a ); + bli_obj_set_target_datatype( dt_targ_b, *b ); + bli_obj_set_target_datatype( dt_targ_c, *c ); + + // Determine the execution datatype. Generally speaking, the + // execution datatype is the real projection of the target datatype + // of c. This rule holds unless the target datatypes of a and b + // are both complex, in which case the execution datatype is also + // complex. + if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) ) + dt_exec = dt_targ_c; + else + dt_exec = bli_datatype_proj_to_real( dt_targ_c ); + + // Embed the execution datatype in all matrix operands. + bli_obj_set_execution_datatype( dt_exec, *a ); + bli_obj_set_execution_datatype( dt_exec, *b ); + bli_obj_set_execution_datatype( dt_exec, *c ); + + // Note that the precisions of the target datatypes of a, b, and c + // match. The domains, however, are not necessarily the same. There + // are eight possible combinations of target domains: + // + // case input target exec pack notes + // domain domain domain c? + // c+=a*b c+=a*b + // (0) r r r r r r r + // (1) r r c r r r r b demoted to real + // (2) r c r r r r r a demoted to real + // (3) r c c c c c c yes a*b demoted to real + // (4) c r r r r r r yes copynzm used to update c + // (5) c r c c r c r yes transposed to induce (6) + // (6) c c r c c r r ~ c and a treated as real + // (7) c c c c c c c + // ~ Must pack c if not column-stored (ie: row or general storage). + // + // There are two special cases: (5) and (6). Because the inner kernels + // assume column storage, it is easy to implement (6) since we can + // simply treat matrices c and a as real matrices with inflated m + // dimension and column stride and then proceed with a kernel for real + // computation. We cannot pull the same trick with case (5) because it + // would result in a mismatch in the k dimension. But we can transform + // case (5) into case (6) by transposing all arguments and swapping the + // a and b operands. Also, we will need to pack matrix c. That is what + // we do here. + if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) ) + { + bli_obj_swap( *a, *b ); + bli_swap_types( dt_targ_a, dt_targ_b ); + bli_obj_toggle_trans( *c ); + bli_obj_toggle_trans( *a ); + bli_obj_toggle_trans( *b ); + } + + // Notice that we use the target datatype of matrix a. By inspecting + // the table above, this clearly works for cases (0) through (4), (6), + // and (7). It also works for case (5) since it is transformed into + // case (6) by the above code. + *dt_alpha = bli_obj_target_datatype( *a ); + + // Notice that we use the target datatype of matrix a. By inspecting + // the table above, this clearly works for cases (0) through (4), (6), + // and (7). It also works for case (5) since it is transformed into + // case (6) by the above code. + *dt_beta = bli_obj_datatype( *c ); + + // For now disable packing of C. + *pack_c = FALSE; +} + + void bli_gemm_get_target_datatypes( obj_t* a, obj_t* b, obj_t* c, diff --git a/frame/3/gemm/bli_gemm_target.h b/frame/3/gemm/bli_gemm_target.h index 19fe693d2..93c27cd31 100644 --- a/frame/3/gemm/bli_gemm_target.h +++ b/frame/3/gemm/bli_gemm_target.h @@ -32,6 +32,13 @@ */ +void bli_gemm_set_targ_exec_datatypes( obj_t* a, + obj_t* b, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ); + void bli_gemm_get_target_datatypes( obj_t* a, obj_t* b, obj_t* c, diff --git a/frame/3/hemm/bli_hemm.c b/frame/3/hemm/bli_hemm.c index c72a0277b..0f7bc3b52 100644 --- a/frame/3/hemm/bli_hemm.c +++ b/frame/3/hemm/bli_hemm.c @@ -52,11 +52,9 @@ void bli_hemm( side_t side, obj_t a_local; obj_t b_local; obj_t c_local; - num_t dt_targ_a; - num_t dt_targ_b; - num_t dt_targ_c; num_t dt_alpha; num_t dt_beta; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -69,47 +67,52 @@ void bli_hemm( side_t side, return; } - // Alias A and B in case we need to induce the right side case. + // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); - // For now, assume the storage datatypes are the desired target - // datatypes. - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_b = bli_obj_datatype( *b ); - dt_targ_c = bli_obj_datatype( *c ); - - // We implement hemm in terms of gemm. But in order to do so we must make - // sure matrix A is on the correct side for our gemm kernel. We assume - // gemm is implemented with a block-panel kernel, thus, we will only - // directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by - // transposing the operation. Since A is Hermitian, we can mark it for - // conjugation instead of transpostion (though transposition should be - // correctly handled as well). - if ( bli_is_right( side ) ) + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. + if ( bli_obj_is_row_stored( *c ) ) { + bli_toggle_side( side ); bli_obj_toggle_conj( a_local ); - bli_obj_toggle_trans( b_local ); - bli_obj_toggle_trans( c_local ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); } - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix A. - dt_alpha = dt_targ_a; + // Swap A and B if multiplying A from the right so that "B" contains + // the Hermitian matrix. + if ( bli_is_right( side ) ) + { + bli_obj_swap( a_local, b_local ); + } + + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_gemm_set_targ_exec_datatypes( &a_local, + &b_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); + + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of C. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + // Choose the control tree. cntl = hemm_cntl; diff --git a/frame/3/hemm/bli_hemm_cntl.c b/frame/3/hemm/bli_hemm_cntl.c index 55549713e..b37f5db4e 100644 --- a/frame/3/hemm/bli_hemm_cntl.c +++ b/frame/3/hemm/bli_hemm_cntl.c @@ -118,7 +118,8 @@ void bli_hemm_cntl_init() hemm_kr, hemm_nr, FALSE, // do NOT scale by alpha - FALSE, // already dense; densify not necessary + //FALSE, // already dense; densify not necessary + TRUE, // densify (if needed) FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? diff --git a/frame/3/her2k/bli_her2k.c b/frame/3/her2k/bli_her2k.c index 81e861d04..4194b5958 100644 --- a/frame/3/her2k/bli_her2k.c +++ b/frame/3/her2k/bli_her2k.c @@ -51,15 +51,13 @@ void bli_her2k( obj_t* alpha, obj_t alpha_conj_local; obj_t beta_local; obj_t c_local; - obj_t ah; - obj_t bh; - num_t dt_targ_a; - num_t dt_targ_b; - num_t dt_targ_c; - num_t dt_exec; + obj_t a_local; + obj_t bh_local; + obj_t b_local; + obj_t ah_local; num_t dt_alpha; num_t dt_beta; - //bool_t pack_c = FALSE; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -72,84 +70,76 @@ void bli_her2k( obj_t* alpha, return; } - // Alias C so we can reset it as the root object (in case it is not - // already a root object). + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); + bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); - // Create objects to track A' and B' (for the second rank-k update). - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah ); - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *b, bh ); + // For her2k, the first and second right-hand "B" operands are simply B' + // and A'. + bli_obj_alias_to( *b, bh_local ); + bli_obj_induce_trans( bh_local ); + bli_obj_toggle_conj( bh_local ); + bli_obj_alias_to( *a, ah_local ); + bli_obj_induce_trans( ah_local ); + bli_obj_toggle_conj( ah_local ); - // Determine the target datatype of each matrix object. - //bli_her2k_get_target_datatypes( a, - // b, - // c, - // &dt_targ_a, - // &dt_targ_b, - // &dt_targ_c, - // &pack_c ); + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. (The effect of the transposition of A and A' is negligible + // because those operands are always packed to contiguous memory.) + if ( bli_obj_is_row_stored( c_local ) ) + { + bli_obj_toggle_conj( a_local ); + bli_obj_toggle_conj( bh_local ); + bli_obj_toggle_conj( b_local ); + bli_obj_toggle_conj( ah_local ); - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_b = bli_obj_datatype( *b ); - dt_targ_c = bli_obj_datatype( *c ); + bli_obj_induce_trans( c_local ); + } - // Set the target datatypes for each matrix object. - bli_obj_set_target_datatype( dt_targ_a, *a ); - bli_obj_set_target_datatype( dt_targ_b, *b ); - bli_obj_set_target_datatype( dt_targ_c, *c ); + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_her2k_set_targ_exec_datatypes( &a_local, + &bh_local, + &b_local, + &ah_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); - // Determine the execution datatype. - dt_exec = dt_targ_a; - - // Embed the execution datatype in all matrix operands. - bli_obj_set_execution_datatype( dt_exec, *a ); - bli_obj_set_execution_datatype( dt_exec, *b ); - bli_obj_set_execution_datatype( dt_exec, *c ); - - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix a. By inspecting the table above, - // this clearly works for cases (0) through (4), (6), and (7). It - // Also works for case (5) since it is transformed into case (6) by - // the above code. - dt_alpha = dt_targ_a; + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Create an object to hold a copy-cast of conj(alpha). - dt_alpha = dt_targ_b; bli_obj_init_scalar_copy_of( dt_alpha, BLIS_CONJUGATE, alpha, &alpha_conj_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of c. Here's why: If c is real and beta is complex, - // there is no reason to keep beta_local in the complex domain since - // the complex part of beta*c will not be stored. If c is complex and - // beta is real then beta is harmlessly promoted to complex. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); - // Choose the control tree based on whether it was determined we need - // to pack c. - //if ( pack_c ) her2k_cntl = her2k_cntl_packabc; - //else her2k_cntl = her2k_cntl_packab; + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + // Choose the control tree. cntl = her2k_cntl; - //if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); // Invoke the internal back-end. bli_her2k_int( &alpha_local, - a, - &bh, + &a_local, + &bh_local, &alpha_conj_local, - b, - &ah, + &b_local, + &ah_local, &beta_local, &c_local, cntl ); diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h index 4db5fdcfc..b75188593 100644 --- a/frame/3/her2k/bli_her2k.h +++ b/frame/3/her2k/bli_her2k.h @@ -35,6 +35,7 @@ #include "bli_her2k_cntl.h" #include "bli_her2k_check.h" #include "bli_her2k_int.h" +#include "bli_her2k_target.h" #include "bli_her2k_l_blk_var1.h" #include "bli_her2k_u_blk_var1.h" diff --git a/frame/3/her2k/bli_her2k_target.c b/frame/3/her2k/bli_her2k_target.c new file mode 100644 index 000000000..affde1309 --- /dev/null +++ b/frame/3/her2k/bli_her2k_target.c @@ -0,0 +1,99 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_her2k_set_targ_exec_datatypes( obj_t* a, + obj_t* bh, + obj_t* b, + obj_t* ah, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ) +{ + num_t dt_targ_a; + num_t dt_targ_bh; + num_t dt_targ_b; + num_t dt_targ_ah; + num_t dt_targ_c; + num_t dt_exec; + + // Determine the target datatype of each matrix object. +/* + bli_gemm_get_target_datatypes( a, + bh, + c, + &dt_targ_a, + &dt_targ_bh, + &dt_targ_c, + pack_c ); +*/ + dt_targ_a = bli_obj_datatype( *a ); + dt_targ_bh = bli_obj_datatype( *bh ); + dt_targ_b = bli_obj_datatype( *b ); + dt_targ_ah = bli_obj_datatype( *ah ); + dt_targ_c = bli_obj_datatype( *c ); + dt_exec = dt_targ_a; + + // Set the target datatypes for each matrix object. + bli_obj_set_target_datatype( dt_targ_a, *a ); + bli_obj_set_target_datatype( dt_targ_bh, *bh ); + bli_obj_set_target_datatype( dt_targ_b, *b ); + bli_obj_set_target_datatype( dt_targ_ah, *ah ); + bli_obj_set_target_datatype( dt_targ_c, *c ); + + // Embed the execution datatype in all matrix operands. + bli_obj_set_execution_datatype( dt_exec, *a ); + bli_obj_set_execution_datatype( dt_exec, *bh ); + bli_obj_set_execution_datatype( dt_exec, *b ); + bli_obj_set_execution_datatype( dt_exec, *ah ); + bli_obj_set_execution_datatype( dt_exec, *c ); + + // Notice that we use the target datatype of matrix a. By inspecting + // the table above, this clearly works for cases (0) through (4), (6), + // and (7). It also works for case (5) since it is transformed into + // case (6) by the above code. + *dt_alpha = bli_obj_target_datatype( *a ); + + // Notice that we use the target datatype of matrix a. By inspecting + // the table above, this clearly works for cases (0) through (4), (6), + // and (7). It also works for case (5) since it is transformed into + // case (6) by the above code. + *dt_beta = bli_obj_datatype( *c ); + + // For now disable packing of C. + *pack_c = FALSE; +} + diff --git a/frame/3/her2k/bli_her2k_target.h b/frame/3/her2k/bli_her2k_target.h new file mode 100644 index 000000000..4d06be8c7 --- /dev/null +++ b/frame/3/her2k/bli_her2k_target.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_her2k_set_targ_exec_datatypes( obj_t* a, + obj_t* bh, + obj_t* b, + obj_t* ah, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ); + diff --git a/frame/3/herk/bli_herk.c b/frame/3/herk/bli_herk.c index af6f5ef63..a79f3d81c 100644 --- a/frame/3/herk/bli_herk.c +++ b/frame/3/herk/bli_herk.c @@ -47,15 +47,12 @@ void bli_herk( obj_t* alpha, herk_t* cntl; obj_t alpha_local; obj_t beta_local; + obj_t a_local; + obj_t ah_local; obj_t c_local; - obj_t ah; - num_t dt_targ_a; - num_t dt_targ_ah; - num_t dt_targ_c; - num_t dt_exec; num_t dt_alpha; num_t dt_beta; - //bool_t pack_c = FALSE; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -68,84 +65,58 @@ void bli_herk( obj_t* alpha, return; } - // Alias C so we can reset it as the root object (in case it is not - // already a root object). + // Alias A and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For herk, the right-hand "B" operand is simply A'. - bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah ); + bli_obj_alias_to( *a, ah_local ); + bli_obj_induce_trans( ah_local ); + bli_obj_toggle_conj( ah_local ); - // Determine the target datatype of each matrix object. - //bli_herk_get_target_datatypes( a, - // c, - // &dt_targ_a, - // &dt_targ_c, - // &pack_c ); - - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_ah = bli_obj_datatype( *a ); - dt_targ_c = bli_obj_datatype( *c ); + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. (The effect of the transposition of A and A' is negligible + // because those operands are always packed to contiguous memory.) + if ( bli_obj_is_row_stored( c_local ) ) + { + bli_obj_toggle_conj( a_local ); + bli_obj_toggle_conj( ah_local ); - // Set the target datatypes for each matrix object. - bli_obj_set_target_datatype( dt_targ_a, *a ); - bli_obj_set_target_datatype( dt_targ_ah, ah ); - bli_obj_set_target_datatype( dt_targ_c, *c ); + bli_obj_induce_trans( c_local ); + } - // Determine the execution datatype. For herk, the execution - // datatype is always the target datatype of a. - dt_exec = dt_targ_a; + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_herk_set_targ_exec_datatypes( &a_local, + &ah_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); - // Embed the execution datatype in all matrix operands. - bli_obj_set_execution_datatype( dt_exec, *a ); - bli_obj_set_execution_datatype( dt_exec, ah ); - bli_obj_set_execution_datatype( dt_exec, *c ); - - // Note that the precisions of the target datatypes of a and c - // match. The domains, however, are not necessarily the same. There - // are four possible combinations of target domains: - // - // case input target exec pack notes - // domain domain domain c? - // c+=a*a' c+=a*a' - // (0) r r r r r r r - // (1) r c c c c c c yes a*a' demoted to real - // (2) c r r r r r r yes copynzm used to update c - // (3) c c c c c c c - - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix a. By inspecting the table above, - // this clearly works for cases (0) through (4), (6), and (7). It - // Also works for case (5) since it is transformed into case (6) by - // the above code. - dt_alpha = dt_targ_a; + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of c. Here's why: If c is real and beta is complex, - // there is no reason to keep beta_local in the complex domain since - // the complex part of beta*c will not be stored. If c is complex and - // beta is real then beta is harmlessly promoted to complex. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); - // Choose the control tree based on whether it was determined we need - // to pack c. - //if ( pack_c ) herk_cntl = herk_cntl_packabc; - //else herk_cntl = herk_cntl_packab; + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + // Choose the control tree. cntl = herk_cntl; - //if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); // Invoke the internal back-end. bli_herk_int( &alpha_local, - a, - &ah, + &a_local, + &ah_local, &beta_local, &c_local, cntl ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 6b203aded..1174bbe9c 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -35,6 +35,7 @@ #include "bli_herk_cntl.h" #include "bli_herk_check.h" #include "bli_herk_int.h" +#include "bli_herk_target.h" #include "bli_herk_l_blk_var1.h" #include "bli_herk_u_blk_var1.h" diff --git a/frame/3/herk/bli_herk_target.c b/frame/3/herk/bli_herk_target.c new file mode 100644 index 000000000..a04ee34b2 --- /dev/null +++ b/frame/3/herk/bli_herk_target.c @@ -0,0 +1,203 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_herk_set_targ_exec_datatypes( obj_t* a, + obj_t* ah, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ) +{ + num_t dt_targ_a; + num_t dt_targ_ah; + num_t dt_targ_c; + num_t dt_exec; + + // Determine the target datatype of each matrix object. +/* + bli_herk_get_target_datatypes( a, + c, + &dt_targ_a, + &dt_targ_c, + pack_c ); +*/ + dt_targ_a = bli_obj_datatype( *a ); + dt_targ_ah = bli_obj_datatype( *ah ); + dt_targ_c = bli_obj_datatype( *c ); + dt_exec = dt_targ_a; + + // Set the target datatypes for each matrix object. + bli_obj_set_target_datatype( dt_targ_a, *a ); + bli_obj_set_target_datatype( dt_targ_ah, *ah ); + bli_obj_set_target_datatype( dt_targ_c, *c ); + + // Embed the execution datatype in all matrix operands. + bli_obj_set_execution_datatype( dt_exec, *a ); + bli_obj_set_execution_datatype( dt_exec, *ah ); + bli_obj_set_execution_datatype( dt_exec, *c ); + + *dt_alpha = bli_obj_target_datatype( *a ); + + *dt_beta = bli_obj_datatype( *c ); + + // For now disable packing of C. + *pack_c = FALSE; +} + +/* +void bli_herk_get_target_datatypes( obj_t* a, + obj_t* c, + num_t* dt_a, + num_t* dt_c, + bool_t* pack_c ) +{ + prec_t tp_a, tp_c; + dom_t td_a, td_c; + + // Determine the target domains for each object. + bli_herk_get_target_domain( a, + c, + &td_a, + &td_c, + pack_c ); + + // Determine the target precisions for each object. + bli_herk_get_target_prec( a, + c, + &tp_a, + &tp_c, + pack_c ); + + // The target datatype of an object is simply the union of its + // target domain and target precision. + *dt_a = td_a | tp_a; + *dt_c = td_c | tp_c; +} +*/ +/* +void bli_herk_get_target_domain( obj_t* a, + obj_t* c, + dom_t* td_a, + dom_t* td_c, + bool_t* pack_c ) +{ + dom_t d_a = bli_obj_domain( *a ); + dom_t d_c = bli_obj_domain( *c ); + + // Note that the precisions of the target datatypes of a and c + // match. The domains, however, are not necessarily the same. There + // are four possible combinations of target domains: + // + // case input target exec pack notes + // domain domain domain c? + // c+=a*a' c+=a*a' + // (0) r r r r r r r + // (1) r c c c c c c yes a*a' demoted to real + // (2) c r r r r r r yes copynzm used to update c + // (3) c c c c c c c + + if ( bli_is_real( d_c ) ) + { + if ( bli_is_complex( d_a ) ) + { + *td_c = *td_a = BLIS_COMPLEX; + *pack_c = TRUE; + } + else + { + *td_c = *td_a = BLIS_REAL; + } + } + else // if ( bli_is_complex( d_c ) ) + { + *td_a = d_a; + + if ( bli_is_real( d_a ) ) + { + *td_c = BLIS_REAL; + *pack_c = TRUE; + } + else + { + *td_c = BLIS_COMPLEX; + + if ( bli_obj_is_real( *a ) ) *pack_c = TRUE; + + if ( bli_obj_is_complex( *a ) ) + { + if ( !bli_obj_is_col_stored( *c ) ) *pack_c = TRUE; + } + } + } +} +*/ + +/* +void bli_herk_get_target_prec( obj_t* a, + obj_t* c, + prec_t* tp_a, + prec_t* tp_c, + bool_t* pack_c ) +{ + prec_t p_a = bli_obj_precision( *a ); + prec_t p_c = bli_obj_precision( *c ); + + if ( bli_is_single_prec( p_c ) ) + { + if ( bli_is_double_prec( p_a ) ) + { + *tp_c = *tp_a = BLIS_DOUBLE_PREC; + *pack_c = TRUE; + } + else + { + *tp_c = *tp_a = BLIS_SINGLE_PREC; + } + } + else // if ( bli_is_double_prec( p_c ) ) + { + if ( bli_is_single_prec( p_a ) ) + { + *tp_c = *tp_a = BLIS_SINGLE_PREC; + *pack_c = TRUE; + } + else + { + *tp_c = *tp_a = BLIS_DOUBLE_PREC; + } + } +} +*/ diff --git a/frame/3/herk/bli_herk_target.h b/frame/3/herk/bli_herk_target.h new file mode 100644 index 000000000..0610c690a --- /dev/null +++ b/frame/3/herk/bli_herk_target.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_herk_set_targ_exec_datatypes( obj_t* a, + obj_t* ah, + obj_t* c, + num_t* dt_alpha, + num_t* dt_beta, + bool_t* pack_c ); + +/* +void bli_herk_get_target_datatypes( obj_t* a, + obj_t* b, + obj_t* c, + num_t* dt_a, + num_t* dt_b, + num_t* dt_c, + bool_t* pack_c ); + +void bli_herk_get_target_domain( obj_t* a, + obj_t* b, + obj_t* c, + dom_t* td_a, + dom_t* td_b, + dom_t* td_c, + bool_t* pack_c ); + +void bli_herk_get_target_prec( obj_t* a, + obj_t* b, + obj_t* c, + prec_t* tp_a, + prec_t* tp_b, + prec_t* tp_c, + bool_t* pack_c ); +*/ diff --git a/frame/3/symm/bli_symm.c b/frame/3/symm/bli_symm.c index 414b9bae1..3f8c8e36c 100644 --- a/frame/3/symm/bli_symm.c +++ b/frame/3/symm/bli_symm.c @@ -52,11 +52,9 @@ void bli_symm( side_t side, obj_t a_local; obj_t b_local; obj_t c_local; - num_t dt_targ_a; - num_t dt_targ_b; - num_t dt_targ_c; num_t dt_alpha; num_t dt_beta; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -69,45 +67,51 @@ void bli_symm( side_t side, return; } - // For now, assume the storage datatypes are the desired target - // datatypes. - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_b = bli_obj_datatype( *b ); - dt_targ_c = bli_obj_datatype( *c ); - - // Alias A and B in case we need to induce the right side case. + // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); - // We implement symm in terms of gemm. But in order to do so we must make - // sure matrix A is on the correct side for our gemm kernel. We assume - // gemm is implemented with a block-panel kernel, thus, we will only - // directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by - // transposing the operation. Since A is symmetric, we do not mark it - // for any conjugation or transposition. - if ( bli_is_right( side ) ) + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. + if ( bli_obj_is_row_stored( *c ) ) { - bli_obj_toggle_trans( b_local ); - bli_obj_toggle_trans( c_local ); + bli_toggle_side( side ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); } - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix A. - dt_alpha = dt_targ_a; + // Swap A and B if multiplying A from the right so that "B" contains + // the symmetric matrix. + if ( bli_is_right( side ) ) + { + bli_obj_swap( a_local, b_local ); + } + + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_gemm_set_targ_exec_datatypes( &a_local, + &b_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); + + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of C. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + // Choose the control tree. We can just use hemm since the algorithm // is nearly identical to that of symm. cntl = hemm_cntl; diff --git a/frame/3/syr2k/bli_syr2k.c b/frame/3/syr2k/bli_syr2k.c index 22df3491c..f5b5a0b1f 100644 --- a/frame/3/syr2k/bli_syr2k.c +++ b/frame/3/syr2k/bli_syr2k.c @@ -50,15 +50,13 @@ void bli_syr2k( obj_t* alpha, obj_t alpha_local; obj_t beta_local; obj_t c_local; - obj_t at; - obj_t bt; - num_t dt_targ_a; - num_t dt_targ_b; - num_t dt_targ_c; - num_t dt_exec; + obj_t a_local; + obj_t bt_local; + obj_t b_local; + obj_t at_local; num_t dt_alpha; num_t dt_beta; - //bool_t pack_c = FALSE; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -71,69 +69,52 @@ void bli_syr2k( obj_t* alpha, return; } - // Alias C so we can reset it as the root object (in case it is not - // already a root object). + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); + bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); - // Create objects to track A^T and B^T (for the second rank-k update). - bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at ); - bli_obj_alias_with_trans( BLIS_TRANSPOSE, *b, bt ); + // For syr2k, the first and second right-hand "B" operands are simply B' + // and A'. + bli_obj_alias_to( *b, bt_local ); + bli_obj_induce_trans( bt_local ); + bli_obj_alias_to( *a, at_local ); + bli_obj_induce_trans( at_local ); - // Determine the target datatype of each matrix object. - //bli_syr2k_get_target_datatypes( a, - // b, - // c, - // &dt_targ_a, - // &dt_targ_b, - // &dt_targ_c, - // &pack_c ); + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. (The effect of the transposition of A and A' is negligible + // because those operands are always packed to contiguous memory.) + if ( bli_obj_is_row_stored( c_local ) ) + { + bli_obj_induce_trans( c_local ); + } - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_b = bli_obj_datatype( *b ); - dt_targ_c = bli_obj_datatype( *c ); + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_her2k_set_targ_exec_datatypes( &a_local, + &bt_local, + &b_local, + &at_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); - // Set the target datatypes for each matrix object. - bli_obj_set_target_datatype( dt_targ_a, *a ); - bli_obj_set_target_datatype( dt_targ_b, *b ); - bli_obj_set_target_datatype( dt_targ_c, *c ); - - // Determine the execution datatype. - dt_exec = dt_targ_a; - - // Embed the execution datatype in all matrix operands. - bli_obj_set_execution_datatype( dt_exec, *a ); - bli_obj_set_execution_datatype( dt_exec, *b ); - bli_obj_set_execution_datatype( dt_exec, *c ); - - - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix a. By inspecting the table above, - // this clearly works for cases (0) through (4), (6), and (7). It - // Also works for case (5) since it is transformed into case (6) by - // the above code. - dt_alpha = dt_targ_a; + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of c. Here's why: If c is real and beta is complex, - // there is no reason to keep beta_local in the complex domain since - // the complex part of beta*c will not be stored. If c is complex and - // beta is real then beta is harmlessly promoted to complex. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); - // Choose the control tree based on whether it was determined we need - // to pack c. - //if ( pack_c ) syr2k_cntl = her2k_cntl_packabc; - //else syr2k_cntl = her2k_cntl_packab; - //if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); // Choose the control tree. We can just use her2k since the algorithm // is nearly identical to that of syr2k. @@ -141,11 +122,11 @@ void bli_syr2k( obj_t* alpha, // Invoke the internal back-end. bli_her2k_int( &alpha_local, - a, - &bt, + &a_local, + &bt_local, &alpha_local, - b, - &at, + &b_local, + &at_local, &beta_local, &c_local, cntl ); diff --git a/frame/3/syrk/bli_syrk.c b/frame/3/syrk/bli_syrk.c index 6b8680f2b..5d0239948 100644 --- a/frame/3/syrk/bli_syrk.c +++ b/frame/3/syrk/bli_syrk.c @@ -47,15 +47,12 @@ void bli_syrk( obj_t* alpha, herk_t* cntl; obj_t alpha_local; obj_t beta_local; + obj_t a_local; + obj_t at_local; obj_t c_local; - obj_t at; - num_t dt_targ_a; - num_t dt_targ_at; - num_t dt_targ_c; - num_t dt_exec; num_t dt_alpha; num_t dt_beta; - //bool_t pack_c = FALSE; + bool_t pack_c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -68,78 +65,46 @@ void bli_syrk( obj_t* alpha, return; } - // Alias C so we can reset it as the root object (in case it is not - // already a root object). + // Alias A and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); - // For syrk, the right-hand "B" operand is simply A^T. - bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at ); + // For herk, the right-hand "B" operand is simply A^T. + bli_obj_alias_to( *a, at_local ); + bli_obj_induce_trans( at_local ); - // Determine the target datatype of each matrix object. - //bli_syrk_get_target_datatypes( a, - // c, - // &dt_targ_a, - // &dt_targ_c, - // &pack_c ); - - dt_targ_a = bli_obj_datatype( *a ); - dt_targ_at = bli_obj_datatype( *a ); - dt_targ_c = bli_obj_datatype( *c ); + // An optimization: If C is row-stored, transpose the entire operation + // so as to allow the macro-kernel more favorable access patterns + // through C. (The effect of the transposition of A and A^T is negligible + // because those operands are always packed to contiguous memory.) + if ( bli_obj_is_row_stored( c_local ) ) + { + bli_obj_induce_trans( c_local ); + } - // Set the target datatypes for each matrix object. - bli_obj_set_target_datatype( dt_targ_a, *a ); - bli_obj_set_target_datatype( dt_targ_at, at ); - bli_obj_set_target_datatype( dt_targ_c, *c ); + // Set the target and execution datatypes of the objects, and apply + // any transformations necessary to handle mixed domain computation. + bli_herk_set_targ_exec_datatypes( &a_local, + &at_local, + &c_local, + &dt_alpha, + &dt_beta, + &pack_c ); - // Determine the execution datatype. For syrk, the execution - // datatype is always the target datatype of a. - dt_exec = dt_targ_a; - - // Embed the execution datatype in all matrix operands. - bli_obj_set_execution_datatype( dt_exec, *a ); - bli_obj_set_execution_datatype( dt_exec, at ); - bli_obj_set_execution_datatype( dt_exec, *c ); - - // Note that the precisions of the target datatypes of a and c - // match. The domains, however, are not necessarily the same. There - // are four possible combinations of target domains: - // - // case input target exec pack notes - // domain domain domain c? - // c+=a*a' c+=a*a' - // (0) r r r r r r r - // (1) r c c c c c c yes a*a^T demoted to real - // (2) c r r r r r r yes copynzm used to update c - // (3) c c c c c c c - - // Create an object to hold a copy-cast of alpha. Notice that we use - // the target datatype of matrix a. By inspecting the table above, - // this clearly works for cases (0) through (4), (6), and (7). It - // Also works for case (5) since it is transformed into case (6) by - // the above code. - dt_alpha = dt_targ_a; + // Create an object to hold a copy-cast of alpha. bli_obj_init_scalar_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); - // Create an object to hold a copy-cast of beta. Notice that we use - // the datatype of c. Here's why: If c is real and beta is complex, - // there is no reason to keep beta_local in the complex domain since - // the complex part of beta*c will not be stored. If c is complex and - // beta is real then beta is harmlessly promoted to complex. - dt_beta = bli_obj_datatype( *c ); + // Create an object to hold a copy-cast of beta. bli_obj_init_scalar_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); - // Choose the control tree based on whether it was determined we need - // to pack c. - //if ( pack_c ) syrk_cntl = herk_cntl_packabc; - //else syrk_cntl = herk_cntl_packab; - //if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); // Choose the control tree. We can just use herk since the algorithm // is nearly identical to that of syrk. @@ -147,8 +112,8 @@ void bli_syrk( obj_t* alpha, // Invoke the internal back-end. bli_herk_int( &alpha_local, - a, - &at, + &a_local, + &at_local, &beta_local, &c_local, cntl ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 540468123..d491895cf 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -894,6 +894,16 @@ bli_obj_width_stored( obj ) : ( bli_obj_buffer_at_off( obj ) ) \ ) + +// Swap objects + +#define bli_obj_swap( a, b ) \ +{ \ + obj_t t; \ + t = b; b = a; a = t; \ +} + + // Swap object pointers #define bli_obj_swap_pointers( a, b ) \ @@ -902,6 +912,7 @@ bli_obj_width_stored( obj ) t = b; b = a; a = t; \ } + // If a transposition is needed, induce one: swap dimensions, increments // and offsets, and then clear the trans bit. diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 11d163924..21df6e25b 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -127,13 +127,13 @@ // min, max, abs -#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) -#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) -#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) ) +#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) +#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) +#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) ) // fmin, fmax, fabs -#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) +#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) < 0.0 ? -(a) : (a) )