diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 228f22714..b66190dbc 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,24 +39,28 @@ // gemm -#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // herk -#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm -#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) + +#define bli_trmm_my_iter( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm -#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trsm_my_iter( index, thread ) \ +\ + ( index % thread->n_way == 
thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 0c62b69ac..73b8bed06 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 6a19e1bdb..3c25d7fa8 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 1967c6ce4..a0074db24 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -268,14 +269,27 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -290,7 +304,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -300,12 +314,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 878889d2a..603fc682b 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + dim_t jr_inc = jr_num_threads; \ + dim_t ir_inc = ir_num_threads; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ @@ -295,11 +299,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c new file mode 100644 index 000000000..b48f46bc0 --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); + + +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + 
// Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2 ) + diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 93c014051..d9f71beaa 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -279,17 +280,57 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). For both the + rectangular and triangular regions, use contiguous assignment for the + 1st loop as well. 
*/ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -304,7 +345,112 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd loop for the remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -317,12 +463,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 5875c3317..862ffe42e 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \ \ /* If there is a zero region to the left of where the diagonal of C intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ if ( diagoffc > 0 ) \ { \ jp = diagoffc / NR; \ @@ -279,17 +282,57 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in the + 2nd loop for the initial triangular region of C (if it exists). 
For both + the rectangular and triangular regions, use contiguous assignment for the + 1st loop. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -304,7 +347,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -317,12 +360,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -402,6 +445,111 @@ void PASTEMAC(ch,varname) \ } \ } \ } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop for + the remaining rectangular region of C. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. 
*/ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or above the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ } INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c new file mode 100644 index 000000000..bd7b69e81 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; +
obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are derived from col_pref (queried below), not from + rs_c/cs_c: ct gets unit row stride when col_pref is true, and unit + column stride otherwise.
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, skip it by advancing the pointers to C and A. + NOTE(review): the new offset is (-diagoffc) % MR, which is zero only when diagoffc is a multiple of MR -- confirm callers guarantee this. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing.
*/ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time).
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part.
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel, updating C directly. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/herk/other/bli_herk_l_ker_var2.c new file mode 100644 index 000000000..832421813 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a =
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names.
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are derived from col_pref (queried below), not from + rs_c/cs_c: ct gets unit row stride when col_pref is true, and unit + column stride otherwise. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing.
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, skip it by advancing the pointers to C and A. + NOTE(review): the new offset is (-diagoffc) % MR, which is zero only when diagoffc is a multiple of MR -- confirm callers guarantee this. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time).
*/ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. NOTE(review): these calls pass (thread, ptr, step), but this patch redefines the bli_herk_get_next_*_upanel macros as (ptr, step, inc) -- confirm this file is not compiled as-is. */ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part.
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel, updating C directly. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c new file mode 100644 index 000000000..398213282 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a =
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names.
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are derived from col_pref (queried below), not from + rs_c/cs_c: ct gets unit row stride when col_pref is true, and unit + column stride otherwise. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing.
*/ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop.
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object.
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or above the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel, updating C directly. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, writing to the temporary buffer. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/herk/other/bli_herk_u_ker_var2.c new file mode 100644 index 000000000..8d1a3021d --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 3778c7302..4d6b49a25 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -85,6 +86,10 @@ void bli_trmm_front } #if 0 + // NOTE: This case casts right-side trmm in terms of left side. This + // reduces the number of macrokernels exercised to two (trmm_ll and + // trmm_lu) but can lead to the microkernel being executed with an + // output matrix that is stored counter to its output preference. // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied @@ -98,6 +103,11 @@ void bli_trmm_front } #else + // NOTE: This case computes right-side trmm natively with trmm_rl and + // trmm_ru macrokernels. This code path always gives us the opportunity + // to transpose the entire operation so that the effective storage format + // of the output matrix matches the microkernel's output preference. + // Thus, from a performance perspective, this case is preferred. // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index ff64501aa..22eca7fb1 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -317,29 +318,45 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). For both the + rectangular and triangular regions, use contiguous assignment for the + 1st loop as well. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -369,7 +386,8 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ @@ -379,7 +397,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -433,13 +451,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ @@ -449,7 +467,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -498,17 +516,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, 
k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index bfe57ba16..c021614a8 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -324,29 +325,45 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). 
For both the + rectangular and triangular regions, use contiguous assignment for the + 1st loop as well. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -376,7 +393,7 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ @@ -386,7 +403,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -440,13 +457,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ @@ -456,7 +473,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -505,17 +522,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index e2eef964e..645df2944 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -324,15 +325,151 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. 
*/ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of B (if it exists). For both the + rectangular and triangular regions, use contiguous assignment for the + 1st loop as well. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd loop for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr loop but skip all but the pointer increment for iterations + that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -358,7 +495,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ @@ -366,7 +502,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. 
*/ \ @@ -375,7 +511,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -390,7 +526,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -449,83 +585,6 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ -\ - /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t - object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. 
*/ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += cstep_b; \ - } \ \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index c76bc535f..f486f7471 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ - dim_t i, j; \ + dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ @@ -324,16 +325,58 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. 
*/ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd loop for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr loop but skip all but the pointer increment for iterations + that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ -\ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -358,7 +401,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. 
*/ \ @@ -366,7 +408,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -375,7 +417,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -390,7 +432,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -449,30 +491,72 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use contiguous assignment of micropanels to threads in both the 2nd and + 1st loops for the remaining rectangular region of B. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -517,19 +601,12 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ } \ -\ - c1 += cstep_c; \ } \ \ +\ +\ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c new file mode 100644 index 000000000..fbbbb7b2f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -0,0 +1,519 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); + + +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* 
buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) 
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c new file mode 100644 index 000000000..2fe01d0e2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -0,0 +1,527 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); + + +void bli_trmm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars 
attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t off_a1112; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely below the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. 
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c new file mode 100644 index 000000000..860295c4c --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); + + +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars 
attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored.
*/ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. 
*/ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c new file mode 100644 index 000000000..e0adf4cf2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); + + +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* 
buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.)
*/ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. 
*/ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel.
*/ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) + diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 8b666b3f4..783572944 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 6be5965a3..7286ba7e0 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
- bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 34fc6a2b6..a244c4ebb 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -339,25 +340,38 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (0 )*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. 
*/ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -409,8 +423,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -474,8 +487,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -531,10 +543,6 @@ void PASTEMAC(ch,varname) \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 78e2a7a15..6317b298a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -347,25 +348,38 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use contiguous assignment of micropanels to threads in the 2nd loop. 
*/ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (m_iter-1)*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ @@ -419,8 +433,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -484,8 +497,7 @@ void PASTEMAC(ch,varname) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -541,10 +553,6 @@ void PASTEMAC(ch,varname) \ \ c11 -= rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c new file mode 100644 index 000000000..4e7e1b850 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -0,0 +1,593 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); + + +void bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. 
This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.)
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. 
*/ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11.
*/ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, 
"trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c new file mode 100644 index 000000000..a8978df86 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -0,0 +1,574 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); + + +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar +
// attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type.
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if PACKMR and NR, or PACKNR and MR, are both odd.
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand).
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs.
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). Each thread executes only the iterations selected by bli_trsm_my_iter(). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time), moving backwards from the last row panel (i = m_iter - 1 - ib). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing.
*/ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the block B11 and the panel + B21. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel, storing the result in ct.
*/ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, writing into ct with a zero beta. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C.
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c new file mode 100644 index 000000000..70b3e456d --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -0,0 +1,591 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); + + +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C.
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to A was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "rl" case (right-side/lower- + triangular), it becomes upper-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases.
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t k_b11; \ + dim_t k_b21; \ + dim_t off_b11; \ + dim_t off_b21; \ + dim_t i, j, jb; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case).
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if PACKMR and NR, or PACKNR and MR, are both odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it so that + we can index to the correct place in C (corresponding to the + part of the panel of B that was packed). + NOTE: This is NOT being done to skip over "no-op" iterations, + as with the trsm_lu macro-kernel. This MUST be done for correct + execution because we use n (via n_iter) to compute diagonal and + index offsets for backwards movement through B. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A.
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time), moving backwards from the last column panel (j = n_iter - 1 - jb). */ \ + for ( jb = 0; jb < n_iter; ++jb ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict b2; \ +\ + j = n_iter - 1 - jb; \ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1 + (n_iter-1)*cstep_c; \ +\ + n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ?
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides below the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b11 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b11; \ + k_b11 = NR; \ + k_b21 = k_b1121 - NR; \ + off_b21 = off_b11 + k_b11; \ +\ + /* Compute the addresses of the triangular block B11 and the + panel B21. */ \ + b11 = b1; \ + /* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \ + b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). Each thread executes only the iterations selected by bli_trsm_my_iter(). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A11 block and A12 panel. */ \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ + a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. The B panels are passed as the "a" operands and the strides of C are swapped because the operation is transposed. */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel, storing the result in ct (strides likewise swapped). */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). Each thread executes only the iterations selected by bli_trsm_my_iter(). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel with the operands (b1, a1) and the strides of C swapped, since the operation is transposed. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, writing into ct with a zero beta. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 -= cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c new file mode 100644 index 000000000..289bb5d9f --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -0,0 +1,584 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); + + +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C.
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the datatype-specific macro-kernel selected above. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "ru" case (right-side/upper- + triangular), it becomes lower-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases.
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t k_b01; \ + dim_t off_b01; \ + dim_t off_b11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case).
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if PACKMR and NR (or PACKNR and MR) are both odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) */ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A.
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ?
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides above the diagonal, use a + regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b01 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ + k_b01 = k_b0111 - NR; \ + off_b11 = k_b01; \ +\ + /* Compute the addresses of the panel B01 and the triangular + block B11. */ \ + b01 = b1; \ + /* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \ + b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A10 panel and A11 block. */ \ + a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel on the temporary buffer ct. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the m_cur x n_cur result from ct to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel, storing to the temporary buffer ct. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) + diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c index 9b5803d9f..1f40933b0 100644 --- a/frame/base/bli_prune.c +++ b/frame/base/bli_prune.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, // matrix is empty.
This is not strictly needed but rather a minor // optimization, as it would prevent threads that would otherwise get // subproblems on BLIS_ZEROS operands from calling the macro-kernel, - // because bli_thread_get_range*() would return empty ranges, which would + // because bli_thread_range*() would return empty ranges, which would // cause the variant's for loop from executing any iterations. // NOTE: this should only ever execute if the primary object is // triangular because that is the only structure type with subpartitions diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index eb92f08b0..613a293e8 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } +static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) +{ + return ( bool_t ) + ( bli_is_strictly_above_diag_n( diagoff, m, n ) || + bli_is_strictly_below_diag_n( diagoff, m, n ) ); +} + static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool_t ) @@ -784,10 +792,14 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) ( i != 0 || n_left == 0 ); } -static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth ) +static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool_t ) - ( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) ); +#ifdef BLIS_JRIR_INTERLEAVE + ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); +#else + ( i == end_iter - 1 ); +#endif } diff --git a/frame/thread/bli_thread.c 
b/frame/thread/bli_thread.c index 2931d0951..886dc15f5 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -59,9 +59,35 @@ void bli_thread_finalize( void ) { } +// ----------------------------------------------------------------------------- +#if 0 +void bli_thread_range_jrir + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ +//#ifdef BLIS_JRIR_INTERLEAVE +#if 1 + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; +#else + // Use contiguous slab partitioning for jr/ir loops. + bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +#endif +} +#endif // ----------------------------------------------------------------------------- -void bli_thread_get_range_sub +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -72,6 +98,9 @@ void bli_thread_get_range_sub ) { dim_t n_way = bli_thread_n_way( thread ); + + if ( n_way == 1 ) { *start = 0; *end = n; return; } + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; @@ -202,7 +231,7 @@ void bli_thread_get_range_sub } } -siz_t bli_thread_get_range_l2r +siz_t bli_thread_range_l2r ( thrinfo_t* thr, obj_t* a, @@ -216,13 +245,13 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_r2l +siz_t bli_thread_range_r2l ( thrinfo_t* thr, obj_t* a, @@ -236,13 +265,13 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_t2b 
+siz_t bli_thread_range_t2b ( thrinfo_t* thr, obj_t* a, @@ -256,13 +285,13 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, m, bf, + FALSE, start, end ); return n * ( *end - *start ); } -siz_t bli_thread_get_range_b2t +siz_t bli_thread_range_b2t ( thrinfo_t* thr, obj_t* a, @@ -276,15 +305,15 @@ siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } // ----------------------------------------------------------------------------- -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -495,17 +524,17 @@ siz_t bli_find_area_trap_l // ----------------------------------------------------------------------------- -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -570,7 +599,7 @@ siz_t bli_thread_get_range_weighted_sub // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = - bli_thread_get_range_width_l + bli_thread_range_width_l ( diagoff_j, m, n_left, j, n_way, @@ -614,7 +643,7 @@ siz_t bli_thread_get_range_weighted_sub bli_toggle_bool( &handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thread, diagoff, uplo, m, n, bf, handle_edge_low, @@ -632,7 +661,7 @@ siz_t bli_thread_get_range_weighted_sub return area; } -siz_t bli_thread_get_range_mdim +siz_t bli_thread_range_mdim ( dir_t direct, thrinfo_t* thr, @@ -678,20 +707,20 @@ siz_t bli_thread_get_range_mdim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + return bli_thread_range_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + return bli_thread_range_b2t( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_ndim +siz_t bli_thread_range_ndim ( dir_t direct, thrinfo_t* thr, @@ -737,20 +766,20 @@ siz_t bli_thread_get_range_ndim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + return bli_thread_range_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + return bli_thread_range_r2l( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_weighted_l2r +siz_t bli_thread_range_weighted_l2r ( thrinfo_t* thr, obj_t* a, @@ -782,7 +811,7 @@ siz_t bli_thread_get_range_weighted_l2r } area = - bli_thread_get_range_weighted_sub + 
bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -790,7 +819,7 @@ siz_t bli_thread_get_range_weighted_l2r } else // if dense or zeros { - area = bli_thread_get_range_l2r + area = bli_thread_range_l2r ( thr, a, bmult, start, end @@ -800,7 +829,7 @@ siz_t bli_thread_get_range_weighted_l2r return area; } -siz_t bli_thread_get_range_weighted_r2l +siz_t bli_thread_range_weighted_r2l ( thrinfo_t* thr, obj_t* a, @@ -834,7 +863,7 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -842,7 +871,7 @@ siz_t bli_thread_get_range_weighted_r2l } else // if dense or zeros { - area = bli_thread_get_range_r2l + area = bli_thread_range_r2l ( thr, a, bmult, start, end @@ -852,7 +881,7 @@ siz_t bli_thread_get_range_weighted_r2l return area; } -siz_t bli_thread_get_range_weighted_t2b +siz_t bli_thread_range_weighted_t2b ( thrinfo_t* thr, obj_t* a, @@ -886,7 +915,7 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -894,7 +923,7 @@ siz_t bli_thread_get_range_weighted_t2b } else // if dense or zeros { - area = bli_thread_get_range_t2b + area = bli_thread_range_t2b ( thr, a, bmult, start, end @@ -904,7 +933,7 @@ siz_t bli_thread_get_range_weighted_t2b return area; } -siz_t bli_thread_get_range_weighted_b2t +siz_t bli_thread_range_weighted_b2t ( thrinfo_t* thr, obj_t* a, @@ -939,7 +968,7 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -947,7 +976,7 @@ siz_t bli_thread_get_range_weighted_b2t } else // if dense or zeros { - area = 
bli_thread_get_range_b2t + area = bli_thread_range_b2t ( thr, a, bmult, start, end diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index ffed93106..8f065bb90 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,7 +57,21 @@ void bli_thread_finalize( void ); #endif // Thread range-related prototypes. -void bli_thread_get_range_sub +#if 0 +void bli_thread_range_jrir + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ); +#endif +// ----------------------------------------------------------------------------- + +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -82,8 +97,8 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_mdim ) -GENPROT( thread_get_range_ndim ) +GENPROT( thread_range_mdim ) +GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ @@ -97,18 +112,18 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_l2r ) -GENPROT( thread_get_range_r2l ) -GENPROT( thread_get_range_t2b ) -GENPROT( thread_get_range_b2t ) +GENPROT( thread_range_l2r ) +GENPROT( thread_range_r2l ) +GENPROT( thread_range_t2b ) +GENPROT( thread_range_b2t ) -GENPROT( thread_get_range_weighted_l2r ) -GENPROT( thread_get_range_weighted_r2l ) -GENPROT( thread_get_range_weighted_t2b ) -GENPROT( thread_get_range_weighted_b2t ) +GENPROT( thread_range_weighted_l2r ) +GENPROT( thread_range_weighted_r2l ) +GENPROT( thread_range_weighted_t2b ) +GENPROT( thread_range_weighted_b2t ) -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -126,17 +141,17 @@ siz_t 
bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ); @@ -215,5 +230,112 @@ void bli_thread_init_rntm( rntm_t* rntm ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); +// ----------------------------------------------------------------------------- + +//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc ); + +static void bli_thread_range_jrir_rr + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use interleaved partitioning of jr/ir loops: start at work_id, step by n_way, stop at n (bf and handle_edge_low are unused). + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; +} + +static void bli_thread_range_jrir_sl + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use contiguous slab partitioning of jr/ir loops.
+ bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +} + +static void bli_thread_range_jrir + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ +// The '#if 0' below always selects the slab (_sl) variant (was: #ifdef BLIS_JRIR_INTERLEAVE). +#if 0 + bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); +#else + bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); +#endif +} + +static void bli_thread_range_weighted_jrir + ( + thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ +#ifdef BLIS_JRIR_INTERLEAVE + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; +#else + // Use contiguous slab partitioning for jr/ir loops. + bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, + handle_edge_low, start, end ); + /* Divide the returned range by bf; end is rounded up when it is not a multiple of bf. */ + *start = *start / bf; *inc = 1; + + if ( *end % bf ) *end = *end / bf + 1; + else *end = *end / bf; + +#endif + +#if 0 + const dim_t n_way = bli_thread_n_way( thread ); + + if ( m * n / n_way > 25000 ) + { + // Use contiguous slab partitioning for jr/ir loops. + bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, + handle_edge_low, start, end ); + *inc = 1; + } + else + { + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = n_way; //bli_thread_n_way( thread ); + *end = n; + } +#endif +} + #endif diff --git a/sandbox/ref99/blx_gemm_int.c b/sandbox/ref99/blx_gemm_int.c index 4937095a9..febb8040a 100644 --- a/sandbox/ref99/blx_gemm_int.c +++ b/sandbox/ref99/blx_gemm_int.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -46,10 +47,10 @@ void blx_gemm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_voft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; + gemm_var_oft f; // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); diff --git a/sandbox/ref99/vars/blx_gemm_blk_var1.c b/sandbox/ref99/vars/blx_gemm_blk_var1.c index 43eb40bef..70482ede1 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var1.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var1 dim_t my_start, my_end; // Determine the current thread's subpartition range. - bli_thread_get_range_mdim + bli_thread_range_mdim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_blk_var2.c b/sandbox/ref99/vars/blx_gemm_blk_var2.c index debcb2dfc..00a19ceef 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var2.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var2 dim_t my_start, my_end; // Determine the current thread's subpartition range. 
- bli_thread_get_range_ndim + bli_thread_range_ndim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index 2a1cbe6b6..a4d37409e 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -255,14 +256,27 @@ void PASTECH2(blx_,ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -277,7 +291,7 @@ void PASTECH2(blx_,ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -287,12 +301,12 @@ void PASTECH2(blx_,ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 3c2a52124..e91b100b2 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=40 \ +PDEF_ST := -DP_BEGIN=96 \ -DP_END=2000 \ - -DP_INC=40 + -DP_INC=96 -PDEF_MT := -DP_BEGIN=200 \ - -DP_END=10000 \ - -DP_INC=200 +PDEF_MT := -DP_BEGIN=192 \ + -DP_END=3000 \ + -DP_INC=192 @@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt blis-st: blis-gemm-st blis-mt: blis-gemm-mt -blis-nat-st: blis-gemm-nat-st -blis-nat-mt: blis-gemm-nat-mt - openblas-st: openblas-gemm-st openblas-mt: openblas-gemm-mt @@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \ blis-gemm-mt: blis-gemm-nat-mt \ blis-gemm-ind-mt +blis-nat-st: \ + test_sgemm_asm_blis_st.x \ + test_dgemm_asm_blis_st.x \ + test_cgemm_asm_blis_st.x \ + test_zgemm_asm_blis_st.x \ + test_sherk_asm_blis_st.x \ + test_dherk_asm_blis_st.x \ + test_cherk_asm_blis_st.x \ + test_zherk_asm_blis_st.x \ + test_strmm_asm_blis_st.x \ + test_dtrmm_asm_blis_st.x \ + test_ctrmm_asm_blis_st.x \ + test_ztrmm_asm_blis_st.x \ + test_strsm_asm_blis_st.x \ + test_dtrsm_asm_blis_st.x \ + test_ctrsm_asm_blis_st.x \ + test_ztrsm_asm_blis_st.x + +blis-nat-mt: \ + test_sgemm_asm_blis_mt.x \ + test_dgemm_asm_blis_mt.x \ + test_cgemm_asm_blis_mt.x \ + test_zgemm_asm_blis_mt.x \ + test_sherk_asm_blis_mt.x \ + test_dherk_asm_blis_mt.x \ + test_cherk_asm_blis_mt.x \ + test_zherk_asm_blis_mt.x \ + test_strmm_asm_blis_mt.x \ + test_dtrmm_asm_blis_mt.x \ + test_ctrmm_asm_blis_mt.x \ + test_ztrmm_asm_blis_mt.x \ + test_strsm_asm_blis_mt.x \ + test_dtrsm_asm_blis_mt.x \ + test_ctrsm_asm_blis_mt.x \ + test_ztrsm_asm_blis_mt.x + blis-gemm-nat-st: \ test_sgemm_asm_blis_st.x \ test_dgemm_asm_blis_st.x \ @@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ # blis asm -test_d%_asm_blis_st.o: 
test_%.c +test_d%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_s%_asm_blis_st.o: test_%.c +test_s%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_z%_asm_blis_st.o: test_%.c +test_z%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_c%_asm_blis_st.o: test_%.c +test_c%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_d%_asm_blis_mt.o: test_%.c +test_d%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_s%_asm_blis_mt.o: test_%.c +test_s%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_z%_asm_blis_mt.o: test_%.c +test_z%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_c%_asm_blis_mt.o: test_%.c +test_c%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ # openblas diff --git a/test/3m4m/test_herk.c b/test/3m4m/test_herk.c new file mode 100644 index 000000000..66a057a59 --- /dev/null +++ b/test/3m4m/test_herk.c @@ -0,0 +1,314 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <unistd.h> +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, k; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, k_input; + ind_t ind; + num_t dt, dt_real; + char dt_ch; + int r, n_repeats; + uplo_t uploc; + trans_t transa; + f77_char f77_uploc; + f77_char f77_transa; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + dt_real = bli_dt_proj_to_real( DT ); + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + k_input = -1; + + + // Suppress compiler warnings about unused variable 'ind'.
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + uploc = BLIS_LOWER; + transa = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt_real, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + if ( bli_does_trans( transa ) ) + bli_obj_create( dt, k, m, 0, 0, &a ); + else + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, m, m, 0, 0, &c ); + //bli_obj_create( dt, m, k, 2, 2*m, &a ); + //bli_obj_create( dt, k, n, 2, 2*k, &b ); + //bli_obj_create( dt, m, n, 2, 2*m, &c ); + bli_obj_create( dt, m, m, 0, 0, &c_save ); + + bli_randm( &a ); + 
bli_randm( &c ); + + bli_obj_set_struc( BLIS_HERMITIAN, &c ); + bli_obj_set_uplo( uploc, &c ); + + bli_obj_set_conjtrans( transa, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_herk( &alpha, + &a, + &beta, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + + ssyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + + dsyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + 
alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + zherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trmm.c b/test/3m4m/test_trmm.c new file mode 100644 index 000000000..06ed38539 --- /dev/null +++ b/test/3m4m/test_trmm.c @@ -0,0 +1,328 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <unistd.h> +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Suppress compiler warnings about unused variable 'ind'.
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_is_left( side ) ) /* a is m x m when applied from the left, n x n from the right. */ + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trmm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* cp = bli_obj_buffer( &c ); + + strmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length(
&c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + 
} + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trsm.c b/test/3m4m/test_trsm.c new file mode 100644 index 000000000..f417a5361 --- /dev/null +++ b/test/3m4m/test_trsm.c @@ -0,0 +1,338 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include <unistd.h> +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c, d; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Suppress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front.
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_is_left( side ) ) /* a is m x m when applied from the left, n x n from the right. */ + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + //bli_obj_create( dt, m, n, n, 1, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + if ( bli_is_left( side ) ) /* d is added to a below, so it takes the same dimensions as a. */ + bli_obj_create( dt, m, m, 0, 0, &d ); + else + bli_obj_create( dt, n, n, 0, 0, &d ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setd( &BLIS_TWO, &d ); + bli_addd( &d, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trsm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float*
cp = bli_obj_buffer( &c ); + + strsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu 
%7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + bli_obj_free( &d ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/thread_ranges/test_ranges.c b/test/thread_ranges/test_ranges.c index 68ffe7fec..9bf293ca5 100644 --- a/test/thread_ranges/test_ranges.c +++ b/test/thread_ranges/test_ranges.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -290,13 +291,13 @@ int main( int argc, char** argv ) thrinfo.work_id = t; if ( part_n_dim && go_fwd ) - area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); else if ( part_n_dim && go_bwd ) - area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); else if ( part_m_dim && go_fwd ) - area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); else // ( part_m_dim && go_bwd ) - area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); width = end - start; diff --git a/windows/build/libblis-symbols.def b/windows/build/libblis-symbols.def index 983292b05..13ae1c60c 100644 --- a/windows/build/libblis-symbols.def +++ b/windows/build/libblis-symbols.def @@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_get_range_b2t -bli_thread_get_range_l2r -bli_thread_get_range_mdim 
-bli_thread_get_range_ndim -bli_thread_get_range_r2l -bli_thread_get_range_sub -bli_thread_get_range_t2b -bli_thread_get_range_weighted_b2t -bli_thread_get_range_weighted_l2r -bli_thread_get_range_weighted_r2l -bli_thread_get_range_weighted_sub -bli_thread_get_range_weighted_t2b -bli_thread_get_range_width_l +bli_thread_range_b2t +bli_thread_range_l2r +bli_thread_range_mdim +bli_thread_range_ndim +bli_thread_range_r2l +bli_thread_range_sub +bli_thread_range_t2b +bli_thread_range_weighted_b2t +bli_thread_range_weighted_l2r +bli_thread_range_weighted_r2l +bli_thread_range_weighted_sub +bli_thread_range_weighted_t2b +bli_thread_range_width_l bli_thread_init bli_thread_init_rntm bli_thread_init_rntm_from_env