mirror of
https://github.com/amd/blis.git
synced 2026-03-23 02:37:25 +00:00
Updated setting/querying pack schema (cntx->cntl).
- Query pack schemas in level-3 bli_*_front() functions and store those values in the schema bitfields of the corresponding obj_t's when the cntx's method is not BLIS_NAT. (When method is BLIS_NAT, the default native schemas are stored to the obj_t's.) - In bli_l3_cntl_create_if(), query the schemas stored to the obj_t's in bli_*_front(), clear the schema bitfields, and pass the queried values into bli_gemm_cntl_create() and bli_trsm_cntl_create(). - Updated APIs for bli_gemm_cntl_create() and bli_trsm_cntl_create() to take schemas for A and B, and use these values to initialize the appropriate control tree nodes. (Also cpp-disabled the panel-block cntl tree creation variant, bli_gemmpb_cntl_create(), as it has not been employed by BLIS in quite some time.) - Simplified querying of schema in bli_packm_init() thanks to above changes. - Updated openmp and pthreads definitions of bli_l3_thread_decorator() so that thread-local aliases of matrix operands are guaranteed, even if aliasing is disabled within the internal back-end functions (e.g. bli_gemm_int.c). Also added a comment to bli_thrcomm_single.c explaining why the extra aliasing is not needed there. - Change bli_gemm() and level-3 friends so that the operation's ind() function is called only if all matrix operands have the same datatype, and only if that datatype is complex. The former condition is needed in preparation for work related to mixed domain operands, while the latter helps with readability, especially for those who don't want to venture into frame/ind. - Reshuffled arguments in bli_cntx_set_thrloop_from_env() to be consistent with BLIS calling conventions (modified argument(s) are last), and updated all invocations in the level-3 _front() functions. - Comment updates to bli_cntx_set_thrloop_from_env().
This commit is contained in:
@@ -56,8 +56,8 @@ siz_t bli_packm_init
|
||||
bool_t does_invert_diag;
|
||||
bool_t rev_iter_if_upper;
|
||||
bool_t rev_iter_if_lower;
|
||||
//pack_t pack_schema;
|
||||
packbuf_t pack_buf_type;
|
||||
pack_t schema;
|
||||
//packbuf_t pack_buf_type;
|
||||
siz_t size_needed;
|
||||
|
||||
// Check parameters.
|
||||
@@ -70,8 +70,8 @@ siz_t bli_packm_init
|
||||
does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl );
|
||||
rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl );
|
||||
rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl );
|
||||
//pack_schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
//pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
|
||||
|
||||
#if 0
|
||||
// Let us now check to see if the object has already been packed. First
|
||||
@@ -112,30 +112,51 @@ siz_t bli_packm_init
|
||||
return 0;
|
||||
}
|
||||
|
||||
// We now ignore the pack_schema field in the control tree and
|
||||
// extract the schema from the context, depending on whether we are
|
||||
// preparing to pack a block of A or panel of B. For A and B, we must
|
||||
// obtain the schema from the context since the induced methods reuse
|
||||
// the same control trees used by native execution, and those induced
|
||||
// methods specify the schema used by the current execution phase
|
||||
// within the context (whereas the control tree does not change).
|
||||
#if 0
|
||||
pack_t schema;
|
||||
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
|
||||
if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
schema = bli_cntx_schema_a_block( cntx );
|
||||
// We now ignore the pack_schema field in the control tree and
|
||||
// extract the schema from the context, depending on whether we are
|
||||
// preparing to pack a block of A or panel of B. For A and B, we must
|
||||
// obtain the schema from the context since the induced methods reuse
|
||||
// the same control trees used by native execution, and those induced
|
||||
// methods specify the schema used by the current execution phase
|
||||
// within the context (whereas the control tree does not change).
|
||||
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
|
||||
{
|
||||
schema = bli_cntx_schema_a_block( cntx );
|
||||
}
|
||||
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
|
||||
{
|
||||
schema = bli_cntx_schema_b_panel( cntx );
|
||||
}
|
||||
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
}
|
||||
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
|
||||
else // ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
schema = bli_cntx_schema_b_panel( cntx );
|
||||
// For native execution, we obtain the schema from the control tree
|
||||
// node. (Notice that it doesn't matter if the pack_buf_type is for
|
||||
// A or B.)
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
// This is no longer needed now that we branch between native and
|
||||
// non-native cases above.
|
||||
#if 0
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
// If we get a request to pack C for some reason, it is likely
|
||||
// not part of an induced method, and so it would be safe (and
|
||||
// necessary) to read the pack schema from the control tree.
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Prepare a few other variables based on properties of the control
|
||||
// tree.
|
||||
|
||||
@@ -45,6 +45,21 @@ void bli_l3_cntl_create_if
|
||||
cntl_t** cntl_use
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects. Notice that we do this even if the
|
||||
// caller passed in a custom control tree; that's because we still need
|
||||
// to reset the pack schema of a and b, which were modified by the
|
||||
// operation's _front() function.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// If the control tree pointer is NULL, we construct a default
|
||||
// tree as a function of the operation family.
|
||||
if ( cntl_orig == NULL )
|
||||
@@ -53,7 +68,7 @@ void bli_l3_cntl_create_if
|
||||
family == BLIS_HERK ||
|
||||
family == BLIS_TRMM )
|
||||
{
|
||||
*cntl_use = bli_gemm_cntl_create( family );
|
||||
*cntl_use = bli_gemm_cntl_create( family, schema_a, schema_b );
|
||||
}
|
||||
else // if ( family == BLIS_TRSM )
|
||||
{
|
||||
@@ -62,7 +77,7 @@ void bli_l3_cntl_create_if
|
||||
if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT;
|
||||
else side = BLIS_RIGHT;
|
||||
|
||||
*cntl_use = bli_trsm_cntl_create( side );
|
||||
*cntl_use = bli_trsm_cntl_create( side, schema_a, schema_b );
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
@@ -57,20 +57,25 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
\
|
||||
BLIS_OAPI_CNTX_DECL \
|
||||
\
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
This function will call native execution for real domain problems.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. */ \
|
||||
PASTEMAC(opname,ind) \
|
||||
( \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
beta, \
|
||||
c, \
|
||||
cntx \
|
||||
); \
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
chooser function and proceed directly with native execution, which is
|
||||
where mixed datatype support will be implemented (if at all). */ \
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
|
||||
bli_obj_dt( b ) == bli_obj_dt( c ) && \
|
||||
bli_obj_is_complex( c ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( gemm )
|
||||
@@ -96,16 +101,25 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
\
|
||||
BLIS_OAPI_CNTX_DECL \
|
||||
\
|
||||
PASTEMAC(opname,ind) \
|
||||
( \
|
||||
side, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
beta, \
|
||||
c, \
|
||||
cntx \
|
||||
); \
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
chooser function and proceed directly with native execution, which is
|
||||
where mixed datatype support will be implemented (if at all). */ \
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
|
||||
bli_obj_dt( b ) == bli_obj_dt( c ) && \
|
||||
bli_obj_is_complex( c ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx ); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( hemm )
|
||||
@@ -129,14 +143,24 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
\
|
||||
BLIS_OAPI_CNTX_DECL \
|
||||
\
|
||||
PASTEMAC(opname,ind) \
|
||||
( \
|
||||
alpha, \
|
||||
a, \
|
||||
beta, \
|
||||
c, \
|
||||
cntx \
|
||||
); \
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
chooser function and proceed directly with native execution, which is
|
||||
where mixed datatype support will be implemented (if at all). */ \
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
|
||||
bli_obj_is_complex( c ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( alpha, a, beta, c, cntx ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, beta, c, cntx ); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( herk )
|
||||
@@ -159,14 +183,24 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
\
|
||||
BLIS_OAPI_CNTX_DECL \
|
||||
\
|
||||
PASTEMAC(opname,ind) \
|
||||
( \
|
||||
side, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
cntx \
|
||||
); \
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
chooser function and proceed directly with native execution, which is
|
||||
where mixed datatype support will be implemented (if at all). */ \
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \
|
||||
bli_obj_is_complex( b ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( side, alpha, a, b, cntx ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( side, alpha, a, b, cntx ); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( trmm )
|
||||
|
||||
@@ -36,17 +36,21 @@
|
||||
|
||||
cntl_t* bli_gemm_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
opid_t family,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
return bli_gemmbp_cntl_create( family );
|
||||
return bli_gemmbp_cntl_create( family, schema_a, schema_b );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
opid_t family,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var2;
|
||||
@@ -82,7 +86,7 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
schema_a, // normally BLIS_PACKED_ROW_PANELS
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
gemm_cntl_bp_bu
|
||||
);
|
||||
@@ -106,7 +110,7 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
schema_b, // normally BLIS_PACKED_COL_PANELS
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
gemm_cntl_op_bp
|
||||
);
|
||||
@@ -134,6 +138,10 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// This control tree creation function is disabled because it is no longer used.
|
||||
// (It was originally created in the run up to publishing the 1m journal article,
|
||||
// but was disabled to reduce complexity.)
|
||||
#if 0
|
||||
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
@@ -223,6 +231,7 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
|
||||
return gemm_cntl_vl_mm;
|
||||
}
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -34,20 +34,26 @@
|
||||
|
||||
cntl_t* bli_gemm_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
opid_t family,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
opid_t family,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
);
|
||||
|
||||
#if 0
|
||||
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
opid_t family,
|
||||
);
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -87,10 +87,34 @@ void bli_gemm_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -88,10 +88,34 @@ void bli_hemm_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_HEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -106,10 +106,38 @@ void bli_her2k_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_HER2K,
|
||||
BLIS_LEFT, // ignored for her[2]k/syr[2]k
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bh_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &bh_local );
|
||||
bli_obj_set_pack_schema( schema_a, &b_local );
|
||||
bli_obj_set_pack_schema( schema_b, &ah_local );
|
||||
}
|
||||
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
|
||||
|
||||
@@ -86,10 +86,34 @@ void bli_herk_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_HERK,
|
||||
BLIS_LEFT, // ignored for her[2]k/syr[2]k
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &ah_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -87,10 +87,34 @@ void bli_symm_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_SYMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -87,10 +87,38 @@ void bli_syr2k_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_SYR2K,
|
||||
BLIS_LEFT, // ignored for her[2]k/syr[2]k
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bt_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &bt_local );
|
||||
bli_obj_set_pack_schema( schema_a, &b_local );
|
||||
bli_obj_set_pack_schema( schema_b, &at_local );
|
||||
}
|
||||
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
|
||||
|
||||
@@ -80,10 +80,34 @@ void bli_syrk_front
|
||||
}
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_SYRK,
|
||||
BLIS_LEFT, // ignored for her[2]k/syr[2]k
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &at_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -130,10 +130,34 @@ void bli_trmm_front
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_TRMM,
|
||||
side,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -129,10 +129,34 @@ void bli_trmm3_front
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_TRMM3,
|
||||
side,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -36,16 +36,21 @@
|
||||
|
||||
cntl_t* bli_trsm_cntl_create
|
||||
(
|
||||
side_t side
|
||||
side_t side,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create();
|
||||
else return bli_trsm_r_cntl_create();
|
||||
if ( bli_is_left( side ) )
|
||||
return bli_trsm_l_cntl_create( schema_a, schema_b );
|
||||
else
|
||||
return bli_trsm_r_cntl_create( schema_a, schema_b );
|
||||
}
|
||||
|
||||
cntl_t* bli_trsm_l_cntl_create
|
||||
(
|
||||
void
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
@@ -79,7 +84,7 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
TRUE, // invert diagonal
|
||||
TRUE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
schema_a, // normally BLIS_PACKED_ROW_PANELS
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
trsm_cntl_bp_bu
|
||||
);
|
||||
@@ -103,7 +108,7 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
schema_b, // normally BLIS_PACKED_COL_PANELS
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
trsm_cntl_op_bp
|
||||
);
|
||||
@@ -131,7 +136,8 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
|
||||
cntl_t* bli_trsm_r_cntl_create
|
||||
(
|
||||
void
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
@@ -165,7 +171,7 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
schema_a, // normally BLIS_PACKED_ROW_PANELS
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
trsm_cntl_bp_bu
|
||||
);
|
||||
@@ -189,7 +195,7 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
TRUE, // invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
TRUE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
schema_b, // normally BLIS_PACKED_COL_PANELS
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
trsm_cntl_op_bp
|
||||
);
|
||||
|
||||
@@ -34,17 +34,21 @@
|
||||
|
||||
cntl_t* bli_trsm_cntl_create
|
||||
(
|
||||
side_t side
|
||||
side_t side,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
);
|
||||
|
||||
cntl_t* bli_trsm_l_cntl_create
|
||||
(
|
||||
void
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
);
|
||||
|
||||
cntl_t* bli_trsm_r_cntl_create
|
||||
(
|
||||
void
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
);
|
||||
|
||||
void bli_trsm_cntl_free
|
||||
|
||||
@@ -121,10 +121,34 @@ void bli_trsm_front
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ) );
|
||||
bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
BLIS_TRSM,
|
||||
side,
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
cntx
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -869,10 +869,10 @@ void bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
opid_t l3_op,
|
||||
side_t side,
|
||||
cntx_t* cntx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k
|
||||
dim_t k,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
dim_t jc, pc, ic, jr, ir;
|
||||
@@ -988,7 +988,7 @@ void bli_cntx_set_thrloop_from_env
|
||||
);
|
||||
}
|
||||
}
|
||||
else // if ( l3_op == BLIS_GEMM || l3_op == BLIS_HERK )
|
||||
else // any other level-3 operation besides trmm/trsm
|
||||
{
|
||||
bli_cntx_set_thrloop
|
||||
(
|
||||
|
||||
@@ -674,12 +674,15 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... );
|
||||
void bli_cntx_set_l1v_kers( dim_t n_kers, ... );
|
||||
void bli_cntx_set_packm_kers( dim_t n_kers, ... );
|
||||
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
|
||||
side_t side,
|
||||
cntx_t* cntx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k );
|
||||
void bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
opid_t l3_op,
|
||||
side_t side,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_cntx_print( cntx_t* cntx );
|
||||
|
||||
|
||||
@@ -231,11 +231,18 @@ void bli_l3_thread_decorator
|
||||
{
|
||||
dim_t id = omp_get_thread_num();
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass into the thread functions.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
|
||||
bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
|
||||
@@ -243,17 +250,17 @@ void bli_l3_thread_decorator
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
c,
|
||||
&c_t,
|
||||
cntx,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the control tree, if one was created locally.
|
||||
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
|
||||
bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
threads[id] = thread;
|
||||
|
||||
@@ -161,11 +161,18 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
dim_t id = data->id;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass into the thread function.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
|
||||
bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
|
||||
@@ -173,17 +180,17 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
c,
|
||||
&c_t,
|
||||
cntx,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the control tree, if one was created locally.
|
||||
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
|
||||
bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( thread );
|
||||
|
||||
@@ -94,6 +94,12 @@ void bli_l3_thread_decorator
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). (We only needed thread-local copies so each could
|
||||
// safely reset their internal (beta) scalars on c after the first
|
||||
// iteration of the pc (kc) loop.)
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
|
||||
|
||||
|
||||
Reference in New Issue
Block a user