Renamed bli_thread_obarrier(), _obroadcast().

Details:
- Renamed two bli_thread_*() APIs:
    bli_thread_obarrier()   -> bli_thread_barrier()
    bli_thread_obroadcast() -> bli_thread_broadcast()
  The 'o' was a leftover from when thrcomm_t objects tracked both
  "inner" and "outer" communicators. They have long since been
  simplified to only support the latter, and thus the 'o' is
  superfluous.

Change-Id: If9ec9a2383dfb02e1cfc74918f87a1fabddbd55b
This commit is contained in:
Field G. Van Zee
2020-02-25 14:50:53 -06:00
committed by Devrajegowda, Kiran
parent 6a957d7247
commit 9e76059f15
22 changed files with 132 additions and 86 deletions

View File

@@ -18,6 +18,7 @@ but many others have contributed code and feedback, including
Matthew Brett @matthew-brett (University of Birmingham)
Jed Brown @jedbrown (Argonne National Laboratory)
Robin Christ @robinchrist
Mat Cross @matcross (NAG)
Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany)
Jeff Diamond (Oracle)
Johannes Dieterich @iotamudelta

View File

@@ -90,9 +90,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,

View File

@@ -79,9 +79,11 @@ void bli_cntx_init_knl( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,

View File

@@ -150,9 +150,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,

View File

@@ -71,9 +71,11 @@ void bli_cntx_init_skx( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,

View File

@@ -83,9 +83,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
16,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,

View File

@@ -80,9 +80,12 @@ void bli_cntx_init_zen2( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
16,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,

View File

@@ -667,7 +667,7 @@ if ( col_stored ) { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -678,7 +678,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
@@ -691,7 +691,7 @@ else { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*

View File

@@ -73,6 +73,6 @@ void bli_unpackm_int
}
// Barrier so that unpacking is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -50,7 +50,7 @@ void bli_l3_packm
siz_t size_needed;
// FGVZ: Not sure why we need this barrier, but we do.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Every thread initializes x_pack and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void bli_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the contents of the chief thread's local mem_t entry to the
// mem_t field in this thread's control tree node.
@@ -146,7 +146,7 @@ void bli_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
@@ -159,7 +159,7 @@ void bli_l3_packm
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}
}
@@ -182,6 +182,6 @@ void bli_l3_packm
);
// Barrier so that packing is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -237,7 +237,7 @@ if ( col_stored ) { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
@@ -261,7 +261,7 @@ else { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*

View File

@@ -84,7 +84,7 @@ void bli_gemm_blk_var3
bli_thrinfo_sub_node( thread )
);
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it

View File

@@ -66,7 +66,7 @@ void bli_gemm_int
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}
@@ -80,7 +80,7 @@ void bli_gemm_int
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}

View File

@@ -123,7 +123,7 @@ void bli_trsm_blk_var1
// We must execute a barrier here because the upcoming rank-k update
// requires the packed matrix B to be fully updated by the trsm
// subproblem.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Isolate the remaining part of the column panel matrix A, which we do by
// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending

View File

@@ -85,7 +85,7 @@ void bli_trsm_blk_var3
);
//bli_thread_ibarrier( thread );
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure

View File

@@ -68,7 +68,7 @@ void bli_trsm_int
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}
@@ -119,7 +119,7 @@ void bli_trsm_int
}
// FGVZ->TMS: Is this barrier still needed?
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Create the next node in the thrinfo_t structure.
bli_thrinfo_grow( rntm, cntl, thread );

View File

@@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
// Broadcast the temporary array to all threads in the parent's
// communicator.
new_comms = bli_thread_obroadcast( thread_par, new_comms );
new_comms = bli_thread_broadcast( thread_par, new_comms );
// Chiefs in the child communicator allocate the communicator
// object and store it in the array element corresponding to the
@@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
if ( child_comm_id == 0 )
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
@@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
NULL // sub_node
);
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// The parent's chief thread frees the temporary array of thrcomm_t
// pointers.
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
const dim_t child_comm_id = parent_comm_id % child_nt_in;
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
// parent's chief-ness is equivalent to checking for chief-ness in the new
@@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
// Broadcast the new thrcomm_t address to the other threads in the
// parent's group.
new_comm = bli_thread_obroadcast( thread_par, new_comm );
new_comm = bli_thread_broadcast( thread_par, new_comm );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
@@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
NULL // sub_node
);
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
return thread_chl;
}

View File

@@ -171,12 +171,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
// other thrinfo_t-related functions
static void* bli_thread_obroadcast( thrinfo_t* t, void* p )
static void* bli_thread_broadcast( thrinfo_t* t, void* p )
{
return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
}
static void bli_thread_obarrier( thrinfo_t* t )
static void bli_thread_barrier( thrinfo_t* t )
{
bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
}

View File

@@ -65,6 +65,38 @@ typedef union
double d[2];
}v2dd_t;
// return a mask which indicates either:
// - v1 > v2
// - v1 is NaN and v2 is not
// assumes that idx(v1) > idx(v2)
// all "OQ" comparisons false if either operand NaN
#define CMP256( dt, v1, v2 ) \
_mm256_or_p##dt( _mm256_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* v1 > v2 || */ \
_mm256_andnot_p##dt( _mm256_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \
_mm256_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) */ \
) \
);
// return a mask which indicates either:
// - v1 > v2
// - v1 is NaN and v2 is not
// - v1 == v2 (maybe == NaN) and i1 < i2
// all "OQ" comparisons false if either operand NaN
#define CMP128( dt, v1, v2, i1, i2 ) \
_mm_or_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* ( v1 > v2 || */ \
_mm_andnot_p##dt( _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \
_mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) ) || */ \
) \
), \
_mm_and_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_EQ_OQ ), /* ( ( v1 == v2 || */ \
_mm_and_p##dt( _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ), /* ( isnan(v1) && */ \
_mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ) /* isnan(v2) ) ) && */ \
) \
), \
_mm_cmp_p##dt( i1, i2, _CMP_LT_OQ ) /* i1 < i2 ) */ \
) \
);
// -----------------------------------------------------------------------------
void bli_samaxv_zen_int
@@ -122,8 +154,8 @@ void bli_samaxv_zen_int
the previous largest, save it and its index. If NaN is
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
behavior mimics that of LAPACK's i?amax(). */
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
{
abs_chi1_max = abs_chi1;
i_max_l = i;
@@ -157,7 +189,7 @@ void bli_samaxv_zen_int
// Get the absolute value of the vector element.
x_vec.v = _mm256_andnot_ps( sign_mask.v, x_vec.v );
mask_vec.v = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS );
mask_vec.v = CMP256( s, x_vec.v, max_vec.v );
max_vec.v = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v );
maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v );
@@ -166,33 +198,34 @@ void bli_samaxv_zen_int
x += num_vec_elements;
}
max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 );
max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 );
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 );
max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 );
maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 );
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
if ( max_vec_lo.f[0] > max_vec_lo.f[1] )
{
abs_chi1_max = max_vec_lo.f[0];
i_max_l = maxInx_vec_lo.f[0];
}
else
{
abs_chi1_max = max_vec_lo.f[1];
i_max_l = maxInx_vec_lo.f[1];
}
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 );
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 );
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 );
mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
abs_chi1_max = max_vec_lo.f[0];
i_max_l = maxInx_vec_lo.f[0];
for ( i = n - n_left; i < n; i++ )
{
@@ -208,8 +241,8 @@ void bli_samaxv_zen_int
the previous largest, save it and its index. If NaN is
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
behavior mimics that of LAPACK's i?amax(). */
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
{
abs_chi1_max = abs_chi1;
i_max_l = i;
@@ -286,8 +319,8 @@ void bli_damaxv_zen_int
the previous largest, save it and its index. If NaN is
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
behavior mimics that of LAPACK's i?amax(). */
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
{
abs_chi1_max = abs_chi1;
i_max_l = i;
@@ -321,7 +354,7 @@ void bli_damaxv_zen_int
// Get the absolute value of the vector element.
x_vec.v = _mm256_andnot_pd( sign_mask.v, x_vec.v );
mask_vec.v = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS );
mask_vec.v = CMP256( d, x_vec.v, max_vec.v );
max_vec.v = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
@@ -330,26 +363,26 @@ void bli_damaxv_zen_int
x += num_vec_elements;
}
max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 );
max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 );
mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 );
max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 );
maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 );
maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 );
mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
if ( max_vec_lo.d[0] > max_vec_lo.d[1] )
{
abs_chi1_max = max_vec_lo.d[0];
i_max_l = maxInx_vec_lo.d[0];
}
else
{
abs_chi1_max = max_vec_lo.d[1];
i_max_l = maxInx_vec_lo.d[1];
}
abs_chi1_max = max_vec_lo.d[0];
i_max_l = maxInx_vec_lo.d[0];
for ( i = n - n_left; i < n; i++ )
{
@@ -363,10 +396,9 @@ void bli_damaxv_zen_int
/* If the absolute value of the current element exceeds that of
the previous largest, save it and its index. If NaN is
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
encountered, return the index of the first NaN. This
behavior mimics that of LAPACK's i?amax(). */
if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
{
abs_chi1_max = abs_chi1;
i_max_l = i;

View File

@@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */ \
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
{ \
abs_chi1_max = abs_chi1; \
i_max_l = i; \
@@ -129,7 +129,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
encountered, then treat it the same as if it were a valid
value that was smaller than any previously seen. This
behavior mimics that of LAPACK's ?lange(). */ \
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \
{ \
abs_chi1_max = abs_chi1; \
i_max_l = i; \

View File

@@ -51,7 +51,7 @@ void blx_l3_packm
siz_t size_needed;
// FGVZ: Not sure why we need this barrier, but we do.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Every thread initializes x_pack and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void blx_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the contents of the chief thread's local mem_t entry to the
// mem_t field in this thread's control tree node.
@@ -142,7 +142,7 @@ void blx_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
@@ -155,7 +155,7 @@ void blx_l3_packm
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}
}
@@ -178,6 +178,6 @@ void blx_l3_packm
);
// Barrier so that packing is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -73,7 +73,7 @@ void blx_gemm_blk_var3
bli_thrinfo_sub_node( thread )
);
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it