diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index c82392b60..d1c4ef828 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -52,8 +52,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h index a0f5b574d..d7adddf3c 100644 --- a/config/zen2/bli_family_zen2.h +++ b/config/zen2/bli_family_zen2.h @@ -51,8 +51,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 94e37fc17..da9348844 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -84,11 +84,7 @@ // Operation-specific headers. #include "bli_gemm.h" #include "bli_hemm.h" -#include "bli_herk.h" -#include "bli_her2k.h" #include "bli_symm.h" -#include "bli_syrk.h" -#include "bli_syr2k.h" #include "bli_trmm.h" #include "bli_trmm3.h" #include "bli_trsm.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 58b658d1d..1986b3b0f 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -51,8 +51,8 @@ dim_t bli_l3_determine_kc if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_HERK ) - return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_GEMMT ) + return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRMM ) return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRSM ) @@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \ } GENFRONT( gemm_determine_kc, gemm ) -GENFRONT( herk_determine_kc, herk ) +GENFRONT( gemmt_determine_kc, gemmt ) GENFRONT( trmm_determine_kc, trmm ) GENFRONT( trsm_determine_kc, trsm ) @@ -201,7 +201,7 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_blksz_get_def( dt, bsize ); \ b_max = bli_blksz_get_max( dt, bsize ); \ \ - /* Notice that for herk, we do not need to perform any special handling + /* Notice that for gemmt, we do not need to perform any special handling for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined @@ -211,8 +211,8 @@ dim_t PASTEMAC0(opname) \ return b_use; \ } -GENFRONT( herk_determine_kc_f, f ) -GENFRONT( herk_determine_kc_b, b ) +GENFRONT( gemmt_determine_kc_f, f ) +GENFRONT( gemmt_determine_kc_b, b ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index c3301ee13..3ea3c5aa0 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -60,7 +60,7 @@ dim_t PASTEMAC0(opname) \ ); GENPROT( gemm_determine_kc ) -GENPROT( herk_determine_kc ) +GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) @@ -81,8 +81,8 @@ dim_t PASTEMAC0(opname) \ GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) -GENPROT( herk_determine_kc_f ) -GENPROT( herk_determine_kc_b ) +GENPROT( gemmt_determine_kc_f ) +GENPROT( gemmt_determine_kc_b ) GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 413f6a58d..50da4627c 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -597,10 +597,5 @@ void bli_l3_basic_check e_val = bli_check_object_buffer( c ); bli_check_error_code( e_val ); - - // Check for sufficiently sized stack buffers - - e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx ); - bli_check_error_code( e_val ); } diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index f6bfbedbb..3cdecfbc2 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -54,7 +54,7 @@ void bli_l3_cntl_create_if if ( cntl_orig == NULL ) { if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { *cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b ); @@ -97,7 +97,7 @@ void bli_l3_cntl_free opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { bli_gemm_cntl_free( rntm, cntl_use, thread ); diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 7baf2d6ef..0d0a71921 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -46,7 +46,7 @@ dir_t bli_l3_direct opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); - else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c ); else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); @@ -68,14 +68,14 @@ dir_t bli_gemm_direct return BLIS_FWD; } -dir_t bli_herk_direct +dir_t bli_gemmt_direct ( obj_t* a, obj_t* b, obj_t* c ) { - // For herk, movement may be forwards (or backwards). + // For gemmt, movement may be forwards (or backwards). return BLIS_FWD; } diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7383c4a9f..39798407a 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -53,7 +53,7 @@ dir_t PASTEMAC0(opname) \ ); GENPROT( gemm_direct ) -GENPROT( herk_direct ) +GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c index 7c30f61af..fbf73be60 100644 --- a/frame/3/bli_l3_ind.c +++ b/frame/3/bli_l3_ind.c @@ -55,7 +55,8 @@ static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = static BLIS_THREAD_LOCAL bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { - /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ + /* gemm gemmt hemm herk her2k symm + syrk syr2k trmm3 trmm trsm */ /* c z */ /* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, @@ -80,11 +81,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \ GENFUNC( gemm, BLIS_GEMM ) GENFUNC( gemmt, BLIS_GEMMT ) GENFUNC( hemm, BLIS_HEMM ) -GENFUNC( herk, BLIS_HERK ) -GENFUNC( her2k, BLIS_HER2K ) GENFUNC( symm, BLIS_SYMM ) -GENFUNC( syrk, BLIS_SYRK ) -GENFUNC( syr2k, BLIS_SYR2K ) GENFUNC( trmm3, BLIS_TRMM3 ) GENFUNC( trmm, BLIS_TRMM ) GENFUNC( trsm, BLIS_TRSM ) diff --git a/frame/3/bli_l3_ind.h b/frame/3/bli_l3_ind.h index f80757eb0..a14ad783c 100644 --- a/frame/3/bli_l3_ind.h +++ b/frame/3/bli_l3_ind.h @@ -47,11 +47,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) -GENPROT( herk ) -GENPROT( her2k ) GENPROT( symm ) -GENPROT( syrk ) -GENPROT( syr2k ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index f6cfd6640..cd0df7017 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -38,301 +38,508 @@ // Define object-based interfaces (expert). // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* If the rntm is non-NULL, it may indicate that we should forgo sup - handling altogether. */ \ - bool enable_sup = TRUE; \ - if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \ -\ - if ( enable_sup ) \ - { \ - /* Execute the small/unpacked oapi handler. If it finds that the problem - does not fall within the thresholds that define "small", or for some - other reason decides not to use the small/unpacked implementation, - the function returns with BLIS_FAILURE, which causes execution to - proceed towards the conventional implementation. */ \ - err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \ - if ( result == BLIS_SUCCESS ) \ - { \ - return; \ - } \ - } \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If each matrix operand has a complex storage datatype, try to get an - induced method (if one is available and enabled). NOTE: Allowing - precisions to vary while using 1m, which is what we do here, is unique - to gemm; other level-3 operations use 1m only if all storage datatypes - are equal (and they ignore the computation precision). */ \ - if ( bli_obj_is_complex( c ) && \ - bli_obj_is_complex( a ) && \ - bli_obj_is_complex( b ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \ -} - // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be // defined in the sandbox environment. #ifndef BLIS_ENABLE_SANDBOX -GENFRONT( gemm ) + +void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // If the rntm is non-NULL, it may indicate that we should forgo sup + // handling altogether. + bool enable_sup = TRUE; + if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); + + if ( enable_sup ) + { + // Execute the small/unpacked oapi handler. If it finds that the problem + // does not fall within the thresholds that define "small", or for some + // other reason decides not to use the small/unpacked implementation, + // the function returns with BLIS_FAILURE, which causes execution to + // proceed towards the conventional implementation. + err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ); + if ( result == BLIS_SUCCESS ) + { + return; + } + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If each matrix operand has a complex storage datatype, try to get an + // induced method (if one is available and enabled). NOTE: Allowing + // precisions to vary while using 1m, which is what we do here, is unique + // to gemm; other level-3 operations use 1m only if all storage datatypes + // are equal (and they ignore the computation precision). + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_gemmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); +} + #endif -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \ +void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_gemmtind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemmt_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); } -GENFRONT( gemmt ) -GENFRONT( her2k ) -GENFRONT( syr2k ) +void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( side, alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, beta, c, cntx, rntm, NULL ); \ + obj_t ah; + obj_t bh; + obj_t alphah; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_her2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_alias_to( alpha, &alphah ); + bli_obj_toggle_conj( &alphah ); + + bli_obj_alias_to( a, &ah ); + bli_obj_toggle_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + bli_obj_alias_to( b, &bh ); + bli_obj_toggle_trans( &bh ); + bli_obj_toggle_conj( &bh ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bh, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); + + // The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for + // the diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-2k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setid( &BLIS_ZERO, c ); } -GENFRONT( hemm ) -GENFRONT( symm ) -GENFRONT( trmm3 ) +void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \ + obj_t at; + obj_t bt; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_syr2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_alias_to( b, &bt ); + bli_obj_toggle_trans( &bt ); + + bli_obj_alias_to( a, &at ); + bli_obj_toggle_trans( &at ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); } -GENFRONT( herk ) -GENFRONT( syrk ) +void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( b ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ - bli_obj_is_complex( b ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( side, alpha, a, b, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx, rntm, NULL ); \ + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_hemmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_hemm_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); } -GENFRONT( trmm ) -GENFRONT( trsm ) +void PASTEMAC(symm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_symmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_symm_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); +} + + +void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trmm3ind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trmm3_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); +} + + +void PASTEMAC(herk,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t ah; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_herk_check( alpha, a, beta, c, cntx ); + + bli_obj_alias_to( a, &ah ); + bli_obj_toggle_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm ); + + // The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the + // diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setid( &BLIS_ZERO, c ); +} + + +void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t at; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_syrk_check( alpha, a, beta, c, cntx ); + + bli_obj_alias_to( a, &at ); + bli_obj_toggle_trans( &at ); + + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm ); +} + + +void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( b ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trmmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trmm_check( side, alpha, a, b, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); +} + + +void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( b ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trsmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trsm_check( side, alpha, a, b, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); +} diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index fa008fd15..6ca8244cb 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -47,7 +47,7 @@ void bli_l3_prune_unref_mparts_m opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. - else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); } @@ -68,7 +68,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ - else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ } @@ -152,7 +152,7 @@ void PASTEMAC(opname,_prune_unref_mparts_k) \ for the k dimension. */ \ } -GENFRONT( herk ) +GENFRONT( gemmt ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 340ecd4db..ad8f07dc4 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -64,9 +64,9 @@ GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) -GENPROT( herk, m ) -GENPROT( herk, n ) -GENPROT( herk, k ) +GENPROT( gemmt, m ) +GENPROT( gemmt, n ) +GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 4726e1042..37a3909fd 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -44,12 +44,12 @@ #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) -// herk +// gemmt -// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to +// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. -#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) +#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 94f0af409..7883dfd6d 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -93,7 +93,7 @@ void bli_gemm_blk_var3 // can simply overwrite the internal beta scalar with BLIS_ONE once // it has been used in the first iteration. However... - // Unlike variant 3 of gemm and herk, which reset the internal scalar + // Unlike variant 3 of gemm and gemmt, which reset the internal scalar // on C at the end of the first iteration so that subsequent iterations // do not erroneously apply beta more than once, it is important that // this behavior not be applied to trmm. That is because the order of diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index d7cd0a92c..27678e0bf 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -63,7 +63,7 @@ cntl_t* bli_gemmbp_cntl_create // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2; + else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; @@ -167,8 +167,8 @@ cntl_t* bli_gemmpb_cntl_create { void_fp macro_kernel_p = bli_gemm_ker_var1; - // Change the macro-kernel if the operation family is herk or trmm. - //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + // Change the macro-kernel if the operation family is gemmt or trmm. + //if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2; //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h index ed522ee13..32ab3865e 100644 --- a/frame/3/gemmt/bli_gemmt.h +++ b/frame/3/gemmt/bli_gemmt.h @@ -34,3 +34,5 @@ #include "bli_gemmt_front.h" +#include "bli_gemmt_var.h" + diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 84385bf17..9f18a717d 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -108,7 +108,7 @@ void bli_gemmt_front bli_l3_thread_decorator ( bli_gemm_int, - BLIS_HERK, // operation family id (gemmt uses 'herk' family) + BLIS_GEMMT, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_l_ker_var2.c rename to frame/3/gemmt/bli_gemmt_l_ker_var2.c index 5a05672d7..a995e6c52 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); -void bli_herk_l_ker_var2 +void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, @@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -464,11 +464,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -551,5 +551,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_u_ker_var2.c rename to frame/3/gemmt/bli_gemmt_u_ker_var2.c index 9e685a944..3115fc67b 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); -void bli_herk_u_ker_var2 +void bli_gemmt_u_ker_var2 ( obj_t* a, obj_t* b, @@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -490,11 +490,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -554,5 +554,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/gemmt/bli_gemmt_var.h similarity index 90% rename from frame/3/herk/bli_herk_var.h rename to frame/3/gemmt/bli_gemmt_var.h index 00b85fc5c..60c68c9f5 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -52,16 +52,10 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -//GENPROT( herk_blk_var1 ) -//GENPROT( herk_blk_var2 ) -//GENPROT( herk_blk_var3 ) +GENPROT( gemmt_x_ker_var2 ) -GENPROT( herk_x_ker_var2 ) - -GENPROT( herk_l_ker_var2 ) -GENPROT( herk_u_ker_var2 ) -//GENPROT( herk_packa ) -//GENPROT( herk_packb ) +GENPROT( gemmt_l_ker_var2 ) +GENPROT( gemmt_u_ker_var2 ) // @@ -91,6 +85,6 @@ void PASTEMAC(ch,varname) \ thrinfo_t* thread \ ); -INSERT_GENTPROT_BASIC0( herk_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( herk_u_ker_var2 ) +INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) +INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_x_ker_var2.c rename to frame/3/gemmt/bli_gemmt_x_ker_var2.c index b6769d719..6d24ea496 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -37,10 +37,10 @@ static gemm_var_oft vars[2] = { - bli_herk_l_ker_var2, bli_herk_u_ker_var2, + bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; -void bli_herk_x_ker_var2 +void bli_gemmt_x_ker_var2 ( obj_t* a, obj_t* ah, diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c similarity index 97% rename from frame/3/herk/other/bli_herk_l_ker_var2.c rename to frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index 22439f5b2..0bf4b1a0f 100644 --- a/frame/3/herk/other/bli_herk_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); -void bli_herk_l_ker_var2 +void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, @@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ @@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c similarity index 97% rename from frame/3/herk/other/bli_herk_u_ker_var2.c rename to frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index 1aa3ce12d..1655bea55 100644 --- a/frame/3/herk/other/bli_herk_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); -void bli_herk_u_ker_var2 +void bli_gemmt_u_ker_var2 ( obj_t* a, obj_t* b, @@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ @@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h deleted file mode 100644 index 02975c2b5..000000000 --- a/frame/3/her2k/bli_her2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_her2k_front.h" - diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c deleted file mode 100644 index 459ab05c7..000000000 --- a/frame/3/her2k/bli_her2k_front.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t alpha_conj; - obj_t c_local; - obj_t a_local; - obj_t bh_local; - obj_t b_local; - obj_t ah_local; - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For her2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bh_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_toggle_conj( &bh_local ); - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_swap( &a_local, &bh_local ); - bli_obj_swap( &b_local, &ah_local ); - - bli_obj_induce_trans( &a_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_induce_trans( &b_local ); - bli_obj_induce_trans( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx ); - bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx ); - - // Initialize a conjugated copy of alpha. - bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), - BLIS_CONJUGATE, - alpha, - &alpha_conj ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HER2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bh_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-2k product was computed as A*B'+B*A', even for - // the diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-2k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. - bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h deleted file mode 100644 index 0efdb86c2..000000000 --- a/frame/3/her2k/bli_her2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h deleted file mode 100644 index c43728968..000000000 --- a/frame/3/herk/bli_herk.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_herk_front.h" - -#include "bli_herk_var.h" - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c deleted file mode 100644 index 324e18151..000000000 --- a/frame/3/herk/bli_herk_front.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t ah_local; - obj_t c_local; - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For herk, the right-hand "B" operand is simply A'. - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_toggle_conj( &a_local ); - bli_obj_toggle_conj( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HERK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-k product was computed as A*A', even for the - // diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. - bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h deleted file mode 100644 index 44778a450..000000000 --- a/frame/3/herk/bli_herk_front.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c deleted file mode 100644 index 8a99a2e24..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); - - -void bli_herk_l_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. */ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c deleted file mode 100644 index c78a36b29..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2rr.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_l_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and - 1st loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c deleted file mode 100644 index 17e0b0d0e..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2sl.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_l_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd - loop and slab partitioning in the 1st loop for the remaining - triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c deleted file mode 100644 index 31d8fab62..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); - - -void bli_herk_u_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. */ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c deleted file mode 100644 index 085ef6308..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2rr.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_u_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial triangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c deleted file mode 100644 index abc6e5188..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2sl.c +++ /dev/null @@ -1,558 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_u_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd loop - and slab partitioning in the 1st loop for the initial triangular region - of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st loops - loop for the remaining triangular region of C. */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl ) - diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h deleted file mode 100644 index 680e6e399..000000000 --- a/frame/3/syr2k/bli_syr2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_syr2k_front.h" - diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c deleted file mode 100644 index 4f30cc3d5..000000000 --- a/frame/3/syr2k/bli_syr2k_front.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t c_local; - obj_t a_local; - obj_t bt_local; - obj_t b_local; - obj_t at_local; - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syr2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bt_local ); - bli_obj_induce_trans( &bt_local ); - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx ); - bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYR2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bt_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h deleted file mode 100644 index 767bb6ee1..000000000 --- a/frame/3/syr2k/bli_syr2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h deleted file mode 100644 index 4936fe431..000000000 --- a/frame/3/syrk/bli_syrk.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_syrk_front.h" - diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c deleted file mode 100644 index 819941426..000000000 --- a/frame/3/syrk/bli_syrk_front.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t at_local; - obj_t c_local; - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syrk, the right-hand "B" operand is simply A^T. - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - -#if 0 -#ifdef BLIS_ENABLE_SMALL_MATRIX - gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, - cntx, cntl ); - if ( status == BLIS_SUCCESS ) return; -#endif -#endif - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYRK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &at_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h deleted file mode 100644 index bf8d26a52..000000000 --- a/frame/3/syrk/bli_syrk_front.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_syrk_small - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index 78d139e6b..e76314036 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -819,22 +819,26 @@ err_t bli_check_if_exhausted_pool( pool_t* pool ) return e_val; } -err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ) +err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ) { err_t e_val = BLIS_SUCCESS; + num_t dt; - dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - siz_t dt_size = bli_dt_size( dt ); + for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + { + dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + siz_t dt_size = bli_dt_size( dt ); - // NOTE: For induced methods, we use the size of the complex datatypes - // (rather than the size of the native micro-kernels' datatype) because - // the macro-kernel needs this larger micro-tile footprint, even if the - // virtual micro-kernel implementation will only ever be writing to half - // of it (real or imaginary part) at a time. + // NOTE: For induced methods, we use the size of the complex datatypes + // (rather than the size of the native micro-kernels' datatype) because + // the macro-kernel needs this larger micro-tile footprint, even if the + // virtual micro-kernel implementation will only ever be writing to half + // of it (real or imaginary part) at a time. - if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE ) - e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE; + if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE ) + e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE; + } return e_val; } diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index 70ec2fd8f..276d27689 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -103,7 +103,7 @@ err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); -err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ); +err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index c250191fc..0a5bcafd4 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -449,6 +449,11 @@ void bli_gks_register_cntx e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val ); #endif + + // Verify that the register blocksizes in the context are sufficiently large + // relative to the maximum stack buffer size defined at configure-time. + e_val = bli_check_sufficient_stack_buf_size( gks_id_nat ); + bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fa7901583..8a3dcd30a 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -180,12 +180,13 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- BLIS implementation query (level-3) -------------------------------------- char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } +char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } -char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HERK, dt ); } -char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HER2K, dt ); } +char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } -char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYRK, dt ); } -char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYR2K, dt ); } +char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index d900ca4f5..99c7d000d 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -91,6 +91,7 @@ BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index da7643eb6..95587e4a7 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -266,7 +266,7 @@ void bli_acquire_mpart_mdim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -274,10 +274,10 @@ void bli_acquire_mpart_mdim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -489,7 +489,7 @@ void bli_acquire_mpart_ndim // diagonal), and the subpartition does not intersect the root matrix's // diagonal, then we might need to modify some of the subpartition's // properties, depending on its structure type. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -497,10 +497,10 @@ void bli_acquire_mpart_ndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -742,7 +742,7 @@ void bli_acquire_mpart_mndim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && req_part != BLIS_SUBPART00 && req_part != BLIS_SUBPART11 && req_part != BLIS_SUBPART22 ) @@ -762,10 +762,10 @@ void bli_acquire_mpart_mndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 9ebd47de1..6dc4f9141 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -678,7 +678,7 @@ siz_t bli_thread_range_mdim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } @@ -737,7 +737,7 @@ siz_t bli_thread_range_ndim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_gemmt_small.c similarity index 99% rename from kernels/zen/3/bli_syrk_small.c rename to kernels/zen/3/bli_gemmt_small.c index 23d47298c..f2fd88de7 100644 --- a/kernels/zen/3/bli_syrk_small.c +++ b/kernels/zen/3/bli_gemmt_small.c @@ -52,9 +52,9 @@ static float C_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); static double D_C_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); -#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. -#define AT_MR 4 // The kernel dimension of the A transpose SYRK kernel.(AT_MR * NR). -static err_t bli_ssyrk_small +#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. +#define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel.(AT_MR * NR). +static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -65,7 +65,7 @@ static err_t bli_ssyrk_small cntl_t* cntl ); -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -76,7 +76,7 @@ static err_t bli_dsyrk_small cntl_t* cntl ); -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -87,7 +87,7 @@ static err_t bli_ssyrk_small_atbn cntl_t* cntl ); -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -98,11 +98,11 @@ static err_t bli_dsyrk_small_atbn cntl_t* cntl ); /* -* The bli_syrk_small function will use the +* The bli_gemmt_small function will use the * custom MRxNR kernels, to perform the computation. * The custom kernels are used if the [M * N] < 240 * 240 */ -err_t bli_syrk_small +err_t bli_gemmt_small ( obj_t* alpha, obj_t* a, @@ -113,20 +113,20 @@ err_t bli_syrk_small cntl_t* cntl ) { - // FGVZ: This code was originally in bli_syrk_front(). However, it really - // fits more naturally here within the bli_syrk_small() function. This + // FGVZ: This code was originally in bli_gemmt_front(). However, it really + // fits more naturally here within the bli_gemmt_small() function. This // becomes a bit more obvious now that the code is here, as it contains - // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_SYRK, which are specific + // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific // to this implementation. if ( bli_obj_has_trans( a ) ) { // Continue with small implementation. ; } - else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) || - ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) ) + else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) || + ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ) { // Continue with small implementation. ; @@ -162,11 +162,11 @@ err_t bli_syrk_small { if (dt == BLIS_FLOAT) { - return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } else if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } } @@ -175,19 +175,19 @@ err_t bli_syrk_small if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl); } if (dt == BLIS_FLOAT) { - return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl); } return BLIS_NOT_YET_IMPLEMENTED; }; -static err_t bli_ssyrk_small +static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -240,7 +240,7 @@ static err_t bli_ssyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -1584,7 +1584,7 @@ static err_t bli_ssyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy in case of beta = 0 @@ -1673,7 +1673,7 @@ static err_t bli_ssyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); @@ -1703,11 +1703,11 @@ static err_t bli_ssyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 8; _l++; } @@ -1729,8 +1729,8 @@ static err_t bli_ssyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -1747,7 +1747,7 @@ static err_t bli_ssyrk_small } } } - + return BLIS_SUCCESS; } else @@ -1756,7 +1756,7 @@ static err_t bli_ssyrk_small }; -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -1810,7 +1810,7 @@ static err_t bli_dsyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -3154,7 +3154,7 @@ static err_t bli_dsyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy for beta = 0 @@ -3195,7 +3195,7 @@ static err_t bli_dsyrk_small { ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 4; _l++; } @@ -3243,7 +3243,7 @@ static err_t bli_dsyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0); @@ -3273,7 +3273,7 @@ static err_t bli_dsyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3299,8 +3299,8 @@ static err_t bli_dsyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3317,7 +3317,7 @@ static err_t bli_dsyrk_small } } } - + return BLIS_SUCCESS; } else @@ -3326,7 +3326,7 @@ static err_t bli_dsyrk_small }; -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3364,7 +3364,7 @@ static err_t bli_ssyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. + // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3715,7 +3715,7 @@ static err_t bli_ssyrk_small_atbn } } } - + //copy/compute sryk values back to C if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C { @@ -3774,7 +3774,7 @@ static err_t bli_ssyrk_small_atbn return BLIS_NONCONFORMAL_DIMENSIONS; } -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3812,7 +3812,7 @@ static err_t bli_dsyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. + // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3968,7 +3968,7 @@ static err_t bli_dsyrk_small_atbn result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; - + tC += ldc; ymm6 = _mm256_hadd_pd(ymm6, ymm6); _mm256_storeu_pd(scratch, ymm6); @@ -4199,7 +4199,7 @@ static err_t bli_dsyrk_small_atbn } } } - + return BLIS_SUCCESS; } else diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c index bd6c2647e..369017338 100644 --- a/sandbox/gemmlike/bls_gemm_check.c +++ b/sandbox/gemmlike/bls_gemm_check.c @@ -99,11 +99,6 @@ void bls_gemm_check e_val = bli_check_object_buffer( c ); bli_check_error_code( e_val ); - // Check for sufficiently sized stack buffers - - e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx ); - bli_check_error_code( e_val ); - // Check object dimensions. e_val = bli_check_level3_dims( a, b, c );