mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Added sup functionality for SYRK
Details: - Added bli_syrksup function that internally uses gemmt implementation. - Modified OAPI of syrk to call SUP before proceeding to the conventional implementation. - Copied gemmsup threshold function for syrk temporarily. Thresholds are yet to be derived for syrk. Change-Id: I751c6bd62bc76a3e4717f77c5cb33f19b759151d
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -306,9 +306,47 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
BLIS_OAPI_EX_PARAMS \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
bli_init_once(); \
|
||||
\
|
||||
BLIS_OAPI_EX_DECLS \
|
||||
\
|
||||
/* If C has a zero dimension, return early. */ \
|
||||
if ( bli_obj_has_zero_dim( c ) ) {\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* If alpha is zero or A has a zero dimension, \
|
||||
* scale C by beta and return early. */ \
|
||||
\
|
||||
if( bli_obj_equals( alpha, &BLIS_ZERO ) || \
|
||||
bli_obj_has_zero_dim( a ) ) \
|
||||
{ \
|
||||
bli_scalm( beta, c ); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* If the rntm is non-NULL, it may indicate that we should forgo SUP handling altogether. */ \
|
||||
bool enable_sup = TRUE; \
|
||||
if( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
|
||||
\
|
||||
if( enable_sup ) \
|
||||
{ \
|
||||
/* Execute the small/unpacked oapi handler.
|
||||
* If it finds that the problem does not fall within the
|
||||
* thresholds that define "small", or for some other reason
|
||||
* decides not to use the small/unpacked implementation,
|
||||
* the function returns with BLIS_FAILURE, which causes execution
|
||||
* to proceed forward towards conventional implementation. */ \
|
||||
\
|
||||
err_t result = PASTEMAC(opname, sup) ( alpha, a, beta, c, cntx, rntm ); \
|
||||
if( result == BLIS_SUCCESS ) { \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
return; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
@@ -330,9 +368,44 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( herk )
|
||||
GENFRONT( syrk )
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,EX_SUF) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* beta, \
|
||||
obj_t* c \
|
||||
BLIS_OAPI_EX_PARAMS \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
BLIS_OAPI_EX_DECLS \
|
||||
\
|
||||
/* Only proceed with an induced method if all operands have the same
|
||||
(complex) datatype. If any datatypes differ, skip the induced method
|
||||
chooser function and proceed directly with native execution, which is
|
||||
where mixed datatype support will be implemented (if at all). */ \
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
|
||||
bli_obj_is_complex( c ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( alpha, a, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
}
|
||||
GENFRONT(herk)
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019-20, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -110,8 +110,6 @@ err_t bli_gemmsup
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
// Pass in m and n reversed, which simulates a transposition of the
|
||||
@@ -124,8 +122,6 @@ err_t bli_gemmsup
|
||||
else // ukr_prefers_storage_of( c, ... )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) ) {
|
||||
@@ -269,7 +265,6 @@ err_t bli_gemmtsup
|
||||
else // ukr_prefers_storage_of( c, ... )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
if ( !bli_cntx_gemmt_sup_thresh_is_met( dt, n, k, cntx ) ) {
|
||||
@@ -325,4 +320,146 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2);
|
||||
}
|
||||
|
||||
err_t bli_syrksup
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2);
|
||||
|
||||
// Return early if small matrix handling is disabled at configure-time.
|
||||
#ifdef BLIS_DISABLE_SUP_HANDLING
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP is Disabled.");
|
||||
return BLIS_FAILURE;
|
||||
#endif
|
||||
|
||||
// Return early if this is a mixed-datatype computation.
|
||||
if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP doesn't support Mixed datatypes.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
obj_t at_local;
|
||||
|
||||
// For syrk, the right-hand "B" operand is simply A^T.
|
||||
bli_obj_alias_to( a, &at_local );
|
||||
bli_obj_induce_trans( &at_local );
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, &at_local );
|
||||
|
||||
/*General stride is not yet supported in sup*/
|
||||
if(BLIS_XXX==stor_id) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP doesn't support general stride.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const dim_t n = bli_obj_width( c );
|
||||
trans_t transa = bli_obj_conjtrans_status( a );
|
||||
|
||||
//Don't use sup for currently unsupported storage types in cgemmsup
|
||||
if(bli_obj_is_scomplex(c) &&
|
||||
(((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
|
||||
|| ((transa == BLIS_CONJ_NO_TRANSPOSE) || (transa == BLIS_CONJ_TRANSPOSE))
|
||||
)){
|
||||
//printf(" syrksup: Returning with for un-supported storage types and conjugate property in csyrksup \n");
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for csyrk");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
//Don't use sup for currently unsupported storage types in zgemmsup
|
||||
if(bli_obj_is_dcomplex(c) &&
|
||||
(((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
|
||||
|| ((transa == BLIS_CONJ_NO_TRANSPOSE) || (transa == BLIS_CONJ_TRANSPOSE))
|
||||
)){
|
||||
//printf(" syrksup: Returning with for un-supported storage types and conjugate property in zsyrksup \n");
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for zsyrk.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Return early if a microkernel preference-induced transposition would
|
||||
// have been performed and shifted the dimensions outside of the space
|
||||
// of sup-handled problems.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
// Pass in m and n reversed, which simulates a transposition of the
|
||||
// entire operation pursuant to the microkernel storage preference.
|
||||
if ( !bli_cntx_syrk_sup_thresh_is_met( dt, n, k, cntx ) ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Trasposition results in sizes beyond SUP thresholds.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
}
|
||||
else // ukr_prefers_storage_of( c, ... )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
if ( !bli_cntx_syrk_sup_thresh_is_met( dt, n, k, cntx ) ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Sizes beyond SUP thresholds.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
#if 0
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
const dim_t tm = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx );
|
||||
const dim_t tn = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
|
||||
const dim_t tk = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );
|
||||
|
||||
printf( "dims: %d %d %d (threshs: %d %d %d)\n",
|
||||
(int)m, (int)n, (int)k, (int)tm, (int)tn, (int)tk );
|
||||
#endif
|
||||
|
||||
// We've now ruled out the following two possibilities:
|
||||
// - the ukernel prefers the operation as-is, and the sup thresholds are
|
||||
// unsatisfied.
|
||||
// - the ukernel prefers a transposed operation, and the sup thresholds are
|
||||
// unsatisfied after taking into account the transposition.
|
||||
// This implies that the sup thresholds (at least one of them) are met.
|
||||
// and the small/unpacked handler should be called.
|
||||
// NOTE: The sup handler is free to enforce a stricter threshold regime
|
||||
// if it so chooses, in which case it can/should return BLIS_FAILURE.
|
||||
|
||||
// Query the small/unpacked handler from the context and invoke it.
|
||||
gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );
|
||||
|
||||
return
|
||||
gemmtsup_fp
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
&at_local,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -54,3 +54,13 @@ err_t bli_gemmtsup
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
err_t bli_syrksup
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -335,6 +335,12 @@ BLIS_INLINE bool bli_cntx_gemmt_sup_thresh_is_met( num_t dt, dim_t n, dim_t k, c
|
||||
return bli_cntx_l3_sup_thresh_is_met( dt, n, n, k, cntx );
|
||||
}
|
||||
|
||||
// -- syrk specific function
|
||||
// Reports whether a syrk problem with dimensions n and k satisfies the sup
// thresholds for datatype dt in the given context.
// NOTE: The gemm thresholds are borrowed temporarily; syrk-specific
// thresholds still need to be derived.
BLIS_INLINE bool bli_cntx_syrk_sup_thresh_is_met( num_t dt, dim_t n, dim_t k, cntx_t* cntx )
{
    // The generic level-3 check takes (m, n, k); n is supplied for both
    // the m and n dimensions.
    const dim_t m = n;

    const bool thresh_is_met = bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx );

    return thresh_is_met;
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user