diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c
index 533a6dcaf..d3b11c43d 100644
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -86,7 +86,10 @@ void bli_gemm_front
 	bli_cntx_set_family( BLIS_GEMM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end via the thread handler.
 	bli_l3_thread_decorator
diff --git a/frame/3/gemm/old/bli_gemm_thread.c b/frame/3/gemm/old/bli_gemm_thread.c
index abbaf5508..2684fcaf8 100644
--- a/frame/3/gemm/old/bli_gemm_thread.c
+++ b/frame/3/gemm/old/bli_gemm_thread.c
@@ -40,12 +40,12 @@ thrinfo_t** bli_gemm_thrinfo_create_paths( void )
 {
 
 #ifdef BLIS_ENABLE_MULTITHREADING
-	dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" );
-//	dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" );
+	dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT", 1 );
+//	dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT", 1 );
 	dim_t kc_way = 1;
-	dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" );
-	dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" );
-	dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" );
+	dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT", 1 );
+	dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT", 1 );
+	dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT", 1 );
 #else
 	dim_t jc_way = 1;
 	dim_t kc_way = 1;
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index 8bede097b..340aa7edc 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -93,7 +93,10 @@ void bli_hemm_front
 	bli_cntx_set_family( BLIS_GEMM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c
index 7350b5785..c6851d2a4 100644
--- a/frame/3/her2k/bli_her2k_front.c
+++ b/frame/3/her2k/bli_her2k_front.c
@@ -111,7 +111,10 @@ void bli_her2k_front
 	bli_cntx_set_family( BLIS_HERK, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke herk twice, using beta only the first time.
 
diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c
index 7fcd2d356..642be0d99 100644
--- a/frame/3/herk/bli_herk_front.c
+++ b/frame/3/herk/bli_herk_front.c
@@ -91,7 +91,10 @@ void bli_herk_front
 	bli_cntx_set_family( BLIS_HERK, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index cd2f3a20e..57aa11f73 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -92,7 +92,10 @@ void bli_symm_front
 	bli_cntx_set_family( BLIS_GEMM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c
index 47ce91795..f64a765e5 100644
--- a/frame/3/syr2k/bli_syr2k_front.c
+++ b/frame/3/syr2k/bli_syr2k_front.c
@@ -92,7 +92,10 @@ void bli_syr2k_front
 	bli_cntx_set_family( BLIS_HERK, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke herk twice, using beta only the first time.
 
diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c
index f037eb1c1..42d135659 100644
--- a/frame/3/syrk/bli_syrk_front.c
+++ b/frame/3/syrk/bli_syrk_front.c
@@ -85,7 +85,10 @@ void bli_syrk_front
 	bli_cntx_set_family( BLIS_HERK, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index c7231c839..b44ddfcff 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -135,7 +135,10 @@ void bli_trmm_front
 	bli_cntx_set_family( BLIS_TRMM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index cf97bbcf2..e672f7af3 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -134,7 +134,10 @@ void bli_trmm3_front
 	bli_cntx_set_family( BLIS_TRMM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index ceab957a6..c38a193f8 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -124,7 +124,10 @@ void bli_trsm_front
 	bli_cntx_set_family( BLIS_TRSM, cntx );
 
 	// Record the threading for each level within the context.
-	bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx );
+	bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx,
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );
 
 	// Invoke the internal back-end.
 	bli_l3_thread_decorator
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 31e995e1b..4a1da9ae1 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -694,23 +694,56 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c,
 	bli_cntx_set_schema_c( schema_c, cntx );
 }
 
-void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx )
+void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
+                                    dim_t m, dim_t n, dim_t k )
 {
 	dim_t jc, pc, ic, jr, ir;
 
 #ifdef BLIS_ENABLE_MULTITHREADING
-	jc = bli_env_read_nway( "BLIS_JC_NT" );
-	//pc = bli_env_read_nway( "BLIS_KC_NT" );
+
+	dim_t nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 );
+
+	if ( nthread == -1 )
+		nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 );
+
+	if ( nthread < 1 ) nthread = 1;
+
+	bli_partition_2x2( nthread, m*BLIS_DEFAULT_M_THREAD_RATIO,
+	                   n*BLIS_DEFAULT_N_THREAD_RATIO, &ic, &jc );
+
+	for ( ir = BLIS_DEFAULT_MR_THREAD_MAX ; ir > 1 ; ir-- )
+	{
+		if ( ic % ir == 0 )
+		{
+			ic /= ir;
+			break;
+		}
+	}
+
+	for ( jr = BLIS_DEFAULT_NR_THREAD_MAX ; jr > 1 ; jr-- )
+	{
+		if ( jc % jr == 0 )
+		{
+			jc /= jr;
+			break;
+		}
+	}
+
+	jc = bli_env_read_nway( "BLIS_JC_NT", jc );
+	//pc = bli_env_read_nway( "BLIS_KC_NT", 1 );
 	pc = 1;
-	ic = bli_env_read_nway( "BLIS_IC_NT" );
-	jr = bli_env_read_nway( "BLIS_JR_NT" );
-	ir = bli_env_read_nway( "BLIS_IR_NT" );
+	ic = bli_env_read_nway( "BLIS_IC_NT", ic );
+	jr = bli_env_read_nway( "BLIS_JR_NT", jr );
+	ir = bli_env_read_nway( "BLIS_IR_NT", ir );
+
 #else
+
 	jc = 1;
 	pc = 1;
 	ic = 1;
 	jr = 1;
 	ir = 1;
+
 #endif
 
 	if ( l3_op == BLIS_TRMM )
@@ -750,7 +783,7 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx )
 			(
 			  1,
 			  1,
-			  jc * ic * jr,
+			  ic * pc * jc * ir * jr,
 			  1,
 			  1,
 			  cntx
@@ -763,7 +796,7 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx )
 			  1,
 			  1,
 			  1,
-			  ic * jr * ir,
+			  ic * pc * jc * jr * ir,
 			  1,
 			  cntx
 			);
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 6aed68111..8cfd67750 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -436,7 +436,10 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c,
                                  cntx_t* cntx );
 void bli_cntx_set_thrloop_from_env( opid_t l3_op,
                                     side_t side,
-                                    cntx_t* cntx );
+                                    cntx_t* cntx,
+                                    dim_t m,
+                                    dim_t n,
+                                    dim_t k );
 
 // other query functions
 
diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h
index 688ec134d..0adfd050f 100644
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -1347,6 +1347,26 @@
 #endif
 
 
+// -- Define default threading parameters --------------------------------------
+
+
+#ifndef BLIS_DEFAULT_M_THREAD_RATIO
+#define BLIS_DEFAULT_M_THREAD_RATIO 2
+#endif
+
+#ifndef BLIS_DEFAULT_N_THREAD_RATIO
+#define BLIS_DEFAULT_N_THREAD_RATIO 1
+#endif
+
+#ifndef BLIS_DEFAULT_MR_THREAD_MAX
+#define BLIS_DEFAULT_MR_THREAD_MAX 1
+#endif
+
+#ifndef BLIS_DEFAULT_NR_THREAD_MAX
+#define BLIS_DEFAULT_NR_THREAD_MAX 3
+#endif
+
+
 // -- Kernel blocksize checks --------------------------------------------------
 
 // Verify that cache blocksizes are whole multiples of register blocksizes.
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 30614ff73..37ec94292 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -958,10 +958,211 @@ siz_t bli_thread_get_range_weighted_b2t
 
 // -----------------------------------------------------------------------------
 
-// Some utilities
-dim_t bli_env_read_nway( const char* env )
+// Initialize an iterator over the prime factors of n. The factors themselves
+// are retrieved one at a time by calling bli_next_prime_factor().
+void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors )
 {
-	dim_t num = 1;
+	factors->n = n;
+	factors->sqrt_n = (dim_t)sqrt(n);
+	factors->f = 2;
+}
+
+dim_t bli_next_prime_factor( bli_prime_factors_t* factors )
+{
+	// Return the prime factorization of the original number n one-by-one.
+	// Return 1 after all factors have been exhausted.
+
+	// Looping over possible factors in increasing order assures we will
+	// only return prime factors (a la the Sieve of Eratosthenes).
+	while ( factors->f <= factors->sqrt_n )
+	{
+		// Special cases for factors 2-7 handle all numbers not divisible by 11
+		// or another larger prime. The slower loop version is used after that.
+		// If you use a number of threads with large prime factors you get
+		// what you deserve.
+		if ( factors->f == 2 )
+		{
+			if ( factors->n % 2 == 0 )
+			{
+				factors->n /= 2;
+				return 2;
+			}
+			factors->f = 3;
+		}
+		else if ( factors->f == 3 )
+		{
+			if ( factors->n % 3 == 0 )
+			{
+				factors->n /= 3;
+				return 3;
+			}
+			factors->f = 5;
+		}
+		else if ( factors->f == 5 )
+		{
+			if ( factors->n % 5 == 0 )
+			{
+				factors->n /= 5;
+				return 5;
+			}
+			factors->f = 7;
+		}
+		else if ( factors->f == 7 )
+		{
+			if ( factors->n % 7 == 0 )
+			{
+				factors->n /= 7;
+				return 7;
+			}
+			factors->f = 11;
+		}
+		else
+		{
+			if ( factors->n % factors->f == 0 )
+			{
+				factors->n /= factors->f;
+				return factors->f;
+			}
+			factors->f++;
+		}
+	}
+
+	// To get here we must be out of prime factors, leaving only n (if it is
+	// prime) or an endless string of 1s.
+	dim_t tmp = factors->n;
+	factors->n = 1;
+	return tmp;
+}
+
+void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2,
+                        dim_t* nt1, dim_t* nt2 )
+{
+	// Partition a number of threads into two factors nt1 and nt2 such that
+	// nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
+	// slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
+
+	// Return early for small prime numbers of threads (expects nthread >= 1).
+	if ( nthread < 4 )
+	{
+		*nt1 = ( work1 >= work2 ? nthread : 1 );
+		*nt2 = ( work1 < work2 ? nthread : 1 );
+		return;
+	}
+
+	*nt1 = 1;
+	*nt2 = 1;
+
+	// Both algorithms need the prime factorization of nthread.
+	bli_prime_factors_t factors;
+	bli_prime_factorization( nthread, &factors );
+
+	#if 1
+
+	// Fast algorithm: assign prime factors in increasing order to whichever
+	// partition has more work to do. The work is divided by the number of
+	// threads assigned at each iteration. This algorithm is sub-optimal,
+	// for example in the partitioning of 12 with equal work (optimal solution
+	// is 4x3, this algorithm finds 6x2).
+
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
+		if ( work1 > work2 )
+		{
+			work1 /= f;
+			*nt1 *= f;
+		}
+		else
+		{
+			work2 /= f;
+			*nt2 *= f;
+		}
+	}
+
+	#else
+
+	// Slow algorithm: exhaustively constructs all factor pairs of nthread and
+	// chooses the best one.
+
+	// Eight distinct prime factors suffice for any nthread below 223092870.
+	dim_t fact[8];
+	dim_t mult[8];
+
+	// There is always at least one prime factor, so use it for initialization.
+	dim_t nfact = 1;
+	fact[0] = bli_next_prime_factor( &factors );
+	mult[0] = 1;
+
+	// Collect the remaining prime factors, accounting for multiplicity of
+	// repeated factors.
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
+		if ( f == fact[nfact-1] )
+		{
+			mult[nfact-1]++;
+		}
+		else
+		{
+			nfact++;
+			fact[nfact-1] = f;
+			mult[nfact-1] = 1;
+		}
+	}
+
+	// Now loop over all factor pairs. A single factor pair is denoted by how
+	// many of each prime factor are included in the first factor (ntake).
+	dim_t ntake[8] = {0};
+	dim_t min_diff = INT_MAX;
+
+	// Loop over how many prime factors to assign to the first factor in the
+	// pair, for each prime factor. The total number of iterations is
+	// \Prod_{i=0}^{nfact-1} (mult[i]+1).
+	bool done = false;
+	while ( !done )
+	{
+		dim_t x = 1;
+		dim_t y = 1;
+
+		// Form the factors by integer exponentiation and accumulation.
+		for (dim_t i = 0 ; i < nfact ; i++ )
+		{
+			x *= bli_ipow( fact[i], ntake[i] );
+			y *= bli_ipow( fact[i], mult[i]-ntake[i] );
+		}
+
+		// Check if this factor pair is optimal by checking
+		// |nt1*work2 - nt2*work1|.
+		dim_t diff = llabs( x*work2 - y*work1 );
+		if ( diff < min_diff )
+		{
+			min_diff = diff;
+			*nt1 = x;
+			*nt2 = y;
+		}
+
+		// Go to the next factor pair by doing an "odometer loop".
+		for ( dim_t i = 0 ; i < nfact ; i++ )
+		{
+			if ( ++ntake[i] > mult[i] )
+			{
+				ntake[i] = 0;
+				if ( i == nfact-1 ) done = true;
+				else continue;
+			}
+			break;
+		}
+	}
+
+	#endif
+}
+
+// -----------------------------------------------------------------------------
+
+// Some utilities
+dim_t bli_env_read_nway( const char* env, dim_t fallback )
+{
+	dim_t num = fallback;
 	char* str = getenv( env );
 
 	if ( str != NULL )
@@ -986,3 +1187,18 @@ dim_t bli_lcm( dim_t x, dim_t y)
 {
 	return x * y / bli_gcd( x, y );
 }
+
+// Return base raised to the non-negative integer power, computed by binary
+// exponentiation (square-and-multiply). Overflow is not detected.
+dim_t bli_ipow( dim_t base, dim_t power )
+{
+	dim_t p = 1;
+
+	for ( dim_t mask = 0x1 ; mask <= power ; mask <<= 1 )
+	{
+		if ( power & mask ) p *= base;
+		base *= base;
+	}
+
+	return p;
+}
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 3d32872b5..1998253cf 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -164,10 +164,25 @@ void bli_l3_thread_decorator
 	cntl_t* cntl
 );
 
+// Factorization and partitioning prototypes
+typedef struct
+{
+	dim_t n;      // part of the original number not yet factored out
+	dim_t sqrt_n; // truncated sqrt of the original number; trial divisor bound
+	dim_t f;      // next trial divisor
+} bli_prime_factors_t;
+
+void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);
+
+dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
+
+void bli_partition_2x2(dim_t nthread, dim_t work1, dim_t work2, dim_t* nt1, dim_t* nt2);
+
 // Miscellaneous prototypes
-dim_t bli_env_read_nway( const char* env );
+dim_t bli_env_read_nway( const char* env, dim_t fallback );
 dim_t bli_gcd( dim_t x, dim_t y );
 dim_t bli_lcm( dim_t x, dim_t y );
+dim_t bli_ipow( dim_t base, dim_t power );
 
 #endif
 