mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Renamed bli_thread_obarrier(), _obroadcast().
Details:
- Renamed two bli_thread_*() APIs:
bli_thread_obarrier() -> bli_thread_barrier()
bli_thread_obroadcast() -> bli_thread_broadcast()
The 'o' was a leftover from when thrcomm_t objects tracked both
"inner" and "outer" communicators. They have long since been
simplified to only support the latter, and thus the 'o' is
superfluous.
Change-Id: If9ec9a2383dfb02e1cfc74918f87a1fabddbd55b
This commit is contained in:
committed by
Devrajegowda, Kiran
parent
6a957d7247
commit
9e76059f15
1
CREDITS
1
CREDITS
@@ -18,6 +18,7 @@ but many others have contributed code and feedback, including
|
||||
Matthew Brett @matthew-brett (University of Birmingham)
|
||||
Jed Brown @jedbrown (Argonne National Laboratory)
|
||||
Robin Christ @robinchrist
|
||||
Mat Cross @matcross (NAG)
|
||||
Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany)
|
||||
Jeff Diamond (Oracle)
|
||||
Johannes Dieterich @iotamudelta
|
||||
|
||||
@@ -90,9 +90,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
|
||||
@@ -79,9 +79,11 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
|
||||
@@ -150,9 +150,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
|
||||
@@ -71,9 +71,11 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
|
||||
@@ -83,9 +83,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
16,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
|
||||
@@ -80,9 +80,12 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
16,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
|
||||
@@ -667,7 +667,7 @@ if ( col_stored ) { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -678,7 +678,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
@@ -691,7 +691,7 @@ else { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
|
||||
@@ -73,6 +73,6 @@ void bli_unpackm_int
|
||||
}
|
||||
|
||||
// Barrier so that unpacking is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ void bli_l3_packm
|
||||
siz_t size_needed;
|
||||
|
||||
// FGVZ: Not sure why we need this barrier, but we do.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Every thread initializes x_pack and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
@@ -102,7 +102,7 @@ void bli_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the contents of the chief thread's local mem_t entry to the
|
||||
// mem_t field in this thread's control tree node.
|
||||
@@ -146,7 +146,7 @@ void bli_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
@@ -159,7 +159,7 @@ void bli_l3_packm
|
||||
// will already have the cached values in their local control
|
||||
// trees' mem_t entries, currently pointed to by cntl_mem_p.
|
||||
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,6 +182,6 @@ void bli_l3_packm
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -237,7 +237,7 @@ if ( col_stored ) { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
@@ -261,7 +261,7 @@ else { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
|
||||
@@ -84,7 +84,7 @@ void bli_gemm_blk_var3
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
|
||||
@@ -66,7 +66,7 @@ void bli_gemm_int
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -80,7 +80,7 @@ void bli_gemm_int
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ void bli_trsm_blk_var1
|
||||
// We must execute a barrier here because the upcoming rank-k update
|
||||
// requires the packed matrix B to be fully updated by the trsm
|
||||
// subproblem.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Isolate the remaining part of the column panel matrix A, which we do by
|
||||
// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
|
||||
|
||||
@@ -85,7 +85,7 @@ void bli_trsm_blk_var3
|
||||
);
|
||||
|
||||
//bli_thread_ibarrier( thread );
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
|
||||
@@ -68,7 +68,7 @@ void bli_trsm_int
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ void bli_trsm_int
|
||||
}
|
||||
|
||||
// FGVZ->TMS: Is this barrier still needed?
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Create the next node in the thrinfo_t structure.
|
||||
bli_thrinfo_grow( rntm, cntl, thread );
|
||||
|
||||
@@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
|
||||
// Broadcast the temporary array to all threads in the parent's
|
||||
// communicator.
|
||||
new_comms = bli_thread_obroadcast( thread_par, new_comms );
|
||||
new_comms = bli_thread_broadcast( thread_par, new_comms );
|
||||
|
||||
// Chiefs in the child communicator allocate the communicator
|
||||
// object and store it in the array element corresponding to the
|
||||
@@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
if ( child_comm_id == 0 )
|
||||
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// All threads create a new thrinfo_t node using the communicator
|
||||
// that was created by their chief, as identified by parent_work_id.
|
||||
@@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
NULL // sub_node
|
||||
);
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// The parent's chief thread frees the temporary array of thrcomm_t
|
||||
// pointers.
|
||||
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
const dim_t child_comm_id = parent_comm_id % child_nt_in;
|
||||
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
|
||||
// parent's chief-ness is equivalent to checking for chief-ness in the new
|
||||
@@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
|
||||
// Broadcast the new thrcomm_t address to the other threads in the
|
||||
// parent's group.
|
||||
new_comm = bli_thread_obroadcast( thread_par, new_comm );
|
||||
new_comm = bli_thread_broadcast( thread_par, new_comm );
|
||||
|
||||
// All threads create a new thrinfo_t node using the communicator
|
||||
// that was created by their chief, as identified by parent_work_id.
|
||||
@@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
NULL // sub_node
|
||||
);
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
return thread_chl;
|
||||
}
|
||||
|
||||
@@ -171,12 +171,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
|
||||
|
||||
// other thrinfo_t-related functions
|
||||
|
||||
static void* bli_thread_obroadcast( thrinfo_t* t, void* p )
|
||||
static void* bli_thread_broadcast( thrinfo_t* t, void* p )
|
||||
{
|
||||
return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
|
||||
}
|
||||
|
||||
static void bli_thread_obarrier( thrinfo_t* t )
|
||||
static void bli_thread_barrier( thrinfo_t* t )
|
||||
{
|
||||
bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
|
||||
}
|
||||
|
||||
@@ -65,6 +65,38 @@ typedef union
|
||||
double d[2];
|
||||
}v2dd_t;
|
||||
|
||||
// return a mask which indicates either:
|
||||
// - v1 > v2
|
||||
// - v1 is NaN and v2 is not
|
||||
// assumes that idx(v1) > idx(v2)
|
||||
// all "OQ" comparisons false if either operand NaN
|
||||
#define CMP256( dt, v1, v2 ) \
|
||||
_mm256_or_p##dt( _mm256_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* v1 > v2 || */ \
|
||||
_mm256_andnot_p##dt( _mm256_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \
|
||||
_mm256_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) */ \
|
||||
) \
|
||||
);
|
||||
|
||||
// return a mask which indicates either:
|
||||
// - v1 > v2
|
||||
// - v1 is NaN and v2 is not
|
||||
// - v1 == v2 (maybe == NaN) and i1 < i2
|
||||
// all "OQ" comparisons false if either operand NaN
|
||||
#define CMP128( dt, v1, v2, i1, i2 ) \
|
||||
_mm_or_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* ( v1 > v2 || */ \
|
||||
_mm_andnot_p##dt( _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \
|
||||
_mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) ) || */ \
|
||||
) \
|
||||
), \
|
||||
_mm_and_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_EQ_OQ ), /* ( ( v1 == v2 || */ \
|
||||
_mm_and_p##dt( _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ), /* ( isnan(v1) && */ \
|
||||
_mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ) /* isnan(v2) ) ) && */ \
|
||||
) \
|
||||
), \
|
||||
_mm_cmp_p##dt( i1, i2, _CMP_LT_OQ ) /* i1 < i2 ) */ \
|
||||
) \
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_samaxv_zen_int
|
||||
@@ -122,8 +154,8 @@ void bli_samaxv_zen_int
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
behavior mimics that of LAPACK's i?amax(). */
|
||||
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
@@ -157,7 +189,7 @@ void bli_samaxv_zen_int
|
||||
// Get the absolute value of the vector element.
|
||||
x_vec.v = _mm256_andnot_ps( sign_mask.v, x_vec.v );
|
||||
|
||||
mask_vec.v = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS );
|
||||
mask_vec.v = CMP256( s, x_vec.v, max_vec.v );
|
||||
|
||||
max_vec.v = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v );
|
||||
maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v );
|
||||
@@ -166,33 +198,34 @@ void bli_samaxv_zen_int
|
||||
x += num_vec_elements;
|
||||
}
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 );
|
||||
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 );
|
||||
maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
|
||||
maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 );
|
||||
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
|
||||
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
if ( max_vec_lo.f[0] > max_vec_lo.f[1] )
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.f[0];
|
||||
i_max_l = maxInx_vec_lo.f[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.f[1];
|
||||
i_max_l = maxInx_vec_lo.f[1];
|
||||
}
|
||||
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 );
|
||||
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
|
||||
|
||||
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 );
|
||||
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 );
|
||||
|
||||
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
abs_chi1_max = max_vec_lo.f[0];
|
||||
i_max_l = maxInx_vec_lo.f[0];
|
||||
|
||||
for ( i = n - n_left; i < n; i++ )
|
||||
{
|
||||
@@ -208,8 +241,8 @@ void bli_samaxv_zen_int
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
behavior mimics that of LAPACK's i?amax(). */
|
||||
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
@@ -286,8 +319,8 @@ void bli_damaxv_zen_int
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
behavior mimics that of LAPACK's i?amax(). */
|
||||
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
@@ -321,7 +354,7 @@ void bli_damaxv_zen_int
|
||||
// Get the absolute value of the vector element.
|
||||
x_vec.v = _mm256_andnot_pd( sign_mask.v, x_vec.v );
|
||||
|
||||
mask_vec.v = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS );
|
||||
mask_vec.v = CMP256( d, x_vec.v, max_vec.v );
|
||||
|
||||
max_vec.v = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
|
||||
maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
|
||||
@@ -330,26 +363,26 @@ void bli_damaxv_zen_int
|
||||
x += num_vec_elements;
|
||||
}
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 );
|
||||
mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 );
|
||||
maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
|
||||
maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
|
||||
|
||||
mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 );
|
||||
maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 );
|
||||
|
||||
mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
if ( max_vec_lo.d[0] > max_vec_lo.d[1] )
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.d[0];
|
||||
i_max_l = maxInx_vec_lo.d[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.d[1];
|
||||
i_max_l = maxInx_vec_lo.d[1];
|
||||
}
|
||||
abs_chi1_max = max_vec_lo.d[0];
|
||||
i_max_l = maxInx_vec_lo.d[0];
|
||||
|
||||
for ( i = n - n_left; i < n; i++ )
|
||||
{
|
||||
@@ -363,10 +396,9 @@ void bli_damaxv_zen_int
|
||||
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
encountered, return the index of the first NaN. This
|
||||
behavior mimics that of LAPACK's i?amax(). */
|
||||
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
|
||||
@@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */ \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
i_max_l = i; \
|
||||
@@ -129,7 +129,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */ \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
i_max_l = i; \
|
||||
|
||||
@@ -51,7 +51,7 @@ void blx_l3_packm
|
||||
siz_t size_needed;
|
||||
|
||||
// FGVZ: Not sure why we need this barrier, but we do.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Every thread initializes x_pack and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
@@ -102,7 +102,7 @@ void blx_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the contents of the chief thread's local mem_t entry to the
|
||||
// mem_t field in this thread's control tree node.
|
||||
@@ -142,7 +142,7 @@ void blx_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
@@ -155,7 +155,7 @@ void blx_l3_packm
|
||||
// will already have the cached values in their local control
|
||||
// trees' mem_t entries, currently pointed to by cntl_mem_p.
|
||||
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,6 +178,6 @@ void blx_l3_packm
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ void blx_gemm_blk_var3
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
|
||||
Reference in New Issue
Block a user