Added sup functionality for SYRK

Details:
- Added bli_syrksup function that internally uses gemmt implementation.
- Modified OAPI of syrk to call SUP before proceeding to the
  conventional implementation.
- Copied gemmsup threshold function for syrk temporarily. Thresholds are
  yet to be derived for syrk.

Change-Id: I751c6bd62bc76a3e4717f77c5cb33f19b759151d
This commit is contained in:
Meghana Vankadari
2021-04-29 12:32:34 +05:30
parent b239a5aee7
commit 1303732e83
4 changed files with 236 additions and 10 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -306,9 +306,47 @@ void PASTEMAC(opname,EX_SUF) \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* If C has a zero dimension, return early. */ \
if ( bli_obj_has_zero_dim( c ) ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
\
/* If alpha or A or B has a zero dimension, \
* scale C by beta and return early. */ \
\
if( bli_obj_equals( alpha, &BLIS_ZERO ) || \
bli_obj_has_zero_dim( a ) ) \
{ \
bli_scalm( beta, c ); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
\
/* If the rntm is non-NULL, it may indicate that we should forgo SUP handling altogether. */ \
bool enable_sup = TRUE; \
if( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
\
if( enable_sup ) \
{ \
/* Execute the small/unpacked oapi handler.
* If it finds that the problem does not fall within the
* thresholds that define "small", or for some other reason
* decides not to use the small/unpacked implementation,
* the function returns with BLIS_FAILURE, which causes execution
* to proceed forward towards the conventional implementation. */ \
\
err_t result = PASTEMAC(opname, sup) ( alpha, a, beta, c, cntx, rntm ); \
if( result == BLIS_SUCCESS ) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
} \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
@@ -330,9 +368,44 @@ void PASTEMAC(opname,EX_SUF) \
} \
}
GENFRONT( herk )
GENFRONT( syrk )
#undef GENFRONT
/* GENFRONT( opname ): generates the object-API front-end function named
   PASTEMAC(opname,EX_SUF), taking alpha, a, beta and c plus whatever
   BLIS_OAPI_EX_PARAMS expands to. The generated function calls
   bli_init_once() and then dispatches to exactly one of two back-ends:
     - PASTEMAC(opname,ind) when a and c have the same datatype and c is
       complex;
     - PASTEMAC(opname,nat) in every other case.
   This variant performs no early-return checks and no small/unpacked (sup)
   attempt before dispatching. It is instantiated below for herk only.
   NOTE(review): cntx and rntm are not declared in this macro; presumably
   they come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm. */
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
chooser function and proceed directly with native execution, which is
where mixed datatype support will be implemented (if at all). */ \
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
bli_obj_is_complex( c ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( alpha, a, beta, c, cntx, rntm ); \
} \
else \
{ \
/* Datatypes of a and c differ, or c is not complex: call the
operation's "nat" function directly. */ \
PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \
} \
}
GENFRONT(herk)
#undef GENFRONT
#define GENFRONT( opname ) \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-20, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -110,8 +110,6 @@ err_t bli_gemmsup
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
{
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width_after_trans( a );
// Pass in m and n reversed, which simulates a transposition of the
@@ -124,8 +122,6 @@ err_t bli_gemmsup
else // ukr_prefers_storage_of( c, ... )
{
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width_after_trans( a );
if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) ) {
@@ -269,7 +265,6 @@ err_t bli_gemmtsup
else // ukr_prefers_storage_of( c, ... )
{
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t k = bli_obj_width_after_trans( a );
if ( !bli_cntx_gemmt_sup_thresh_is_met( dt, n, k, cntx ) ) {
@@ -325,4 +320,146 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2);
}
// bli_syrksup: small/unpacked (sup) object-API handler for syrk.
//
// Forms a transposed alias of a to act as the right-hand operand and forwards
// the problem to the sup handler registered in the context for BLIS_GEMMT.
//
// Parameters:
//   alpha, a, beta, c - operands of the syrk operation; a is aliased (not
//                       copied) to build the transposed right-hand operand.
//   cntx              - context; if NULL, one is queried from the gks.
//   rntm              - runtime; if NULL, a local runtime is initialized from
//                       the global settings, otherwise a local copy is made.
//
// Returns BLIS_FAILURE, without invoking the handler, when:
//   - sup handling is disabled at configure-time;
//   - the datatypes of a and c differ, or the computation precision of c
//     differs from its storage precision;
//   - the storage combination of c, a and a^T is BLIS_XXX;
//   - c is single- or double-complex and the storage combination is
//     BLIS_RRC or BLIS_CRC, or a carries a conjugation;
//   - bli_cntx_syrk_sup_thresh_is_met() reports that the thresholds are
//     not met.
// Otherwise returns whatever the gemmt sup handler returns.
err_t bli_syrksup
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2);

	// Return early if small matrix handling is disabled at configure-time.
#ifdef BLIS_DISABLE_SUP_HANDLING
	AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP is Disabled.");
	return BLIS_FAILURE;
#endif

	// Return early if this is a mixed-datatype computation.
	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
	{
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP doesn't support Mixed datatypes.");
		return BLIS_FAILURE;
	}

	// For syrk, the right-hand "B" operand is simply A^T.
	obj_t at_local;
	bli_obj_alias_to( a, &at_local );
	bli_obj_induce_trans( &at_local );

	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, &at_local );

	// General stride is not yet supported in sup.
	if ( BLIS_XXX == stor_id )
	{
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP doesn't support general stride.");
		return BLIS_FAILURE;
	}

	const dim_t n      = bli_obj_width( c );
	trans_t     transa = bli_obj_conjtrans_status( a );

	// Don't use sup for storage types / conjugation currently unsupported
	// in cgemmsup.
	if ( bli_obj_is_scomplex( c ) &&
	     ( ( ( stor_id == BLIS_RRC ) || ( stor_id == BLIS_CRC ) ) ||
	       ( ( transa == BLIS_CONJ_NO_TRANSPOSE ) || ( transa == BLIS_CONJ_TRANSPOSE ) ) ) )
	{
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for csyrk");
		return BLIS_FAILURE;
	}

	// Don't use sup for storage types / conjugation currently unsupported
	// in zgemmsup.
	if ( bli_obj_is_dcomplex( c ) &&
	     ( ( ( stor_id == BLIS_RRC ) || ( stor_id == BLIS_CRC ) ) ||
	       ( ( transa == BLIS_CONJ_NO_TRANSPOSE ) || ( transa == BLIS_CONJ_TRANSPOSE ) ) ) )
	{
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for zsyrk.");
		return BLIS_FAILURE;
	}

	// Obtain a valid (native) context from the gks if necessary.
	// NOTE: This must be done before the context is queried below.
	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

	// Both threshold checks below use the same dt, n and k, so compute the
	// shared inputs once.
	const num_t dt = bli_obj_dt( c );
	const dim_t k  = bli_obj_width_after_trans( a );

	// Return early if the sup thresholds are not met. The same (n, k) pair
	// is passed whether or not the microkernel dislikes the storage of c;
	// the two branches differ only in the trace message they emit.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
	{
		if ( !bli_cntx_syrk_sup_thresh_is_met( dt, n, k, cntx ) )
		{
			AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Trasposition results in sizes beyond SUP thresholds.");
			return BLIS_FAILURE;
		}
	}
	else // ukr_prefers_storage_of( c, ... )
	{
		if ( !bli_cntx_syrk_sup_thresh_is_met( dt, n, k, cntx ) )
		{
			AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Sizes beyond SUP thresholds.");
			return BLIS_FAILURE;
		}
	}

	// Initialize a local runtime with global settings if necessary. Note
	// that in the case that a runtime is passed in, we make a local copy.
	rntm_t rntm_l;
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
	else                { rntm_l = *rntm;                       rntm = &rntm_l; }

	// The sup thresholds are met, so the small/unpacked handler should be
	// called.
	// NOTE: The sup handler is free to enforce a stricter threshold regime
	// if it so chooses, in which case it can/should return BLIS_FAILURE.

	// Query the small/unpacked handler from the context and invoke it.
	gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );

	// Capture the handler's status so that the trace exit is emitted before
	// returning; placed after the return statement it would never execute.
	err_t result = gemmtsup_fp
	(
	  alpha,
	  a,
	  &at_local,
	  beta,
	  c,
	  cntx,
	  rntm
	);

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2);

	return result;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -54,3 +54,13 @@ err_t bli_gemmtsup
rntm_t* rntm
);
// Small/unpacked (sup) object-API handler for syrk. Returns BLIS_FAILURE
// when it declines to handle the problem (e.g. sup disabled at
// configure-time, mixed datatypes, unsupported storage, or thresholds not
// met); otherwise returns the status of the gemmt sup handler it invokes.
// cntx and rntm may be NULL.
err_t bli_syrksup
(
obj_t* alpha,
obj_t* a,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -335,6 +335,12 @@ BLIS_INLINE bool bli_cntx_gemmt_sup_thresh_is_met( num_t dt, dim_t n, dim_t k, c
return bli_cntx_l3_sup_thresh_is_met( dt, n, n, k, cntx );
}
// -- syrk specific function
// Reports whether the sup thresholds are met for a syrk problem described by
// datatype dt and dimensions n and k. The answer is delegated to the generic
// level-3 sup threshold query, with n supplied for both its m and n
// arguments.
BLIS_INLINE bool bli_cntx_syrk_sup_thresh_is_met( num_t dt, dim_t n, dim_t k, cntx_t* cntx )
{
	// The gemm thresholds are reused here temporarily; syrk-specific
	// thresholds still need to be derived.
	const bool thresh_met = bli_cntx_l3_sup_thresh_is_met( dt, n, n, k, cntx );

	return thresh_met;
}
// -----------------------------------------------------------------------------