DGEMM: Added decision logic to choose between the SUP and native paths for the zen4 architecture

Details:
- Added a new function for choosing between SUP and
  native implementation for a given size.
- This function pointer is stored in cntx for zen4 config.
- Divided total combinations of sizes into 3 categories:
- One dimension is small
  - Two dimensions are small
  - All dimensions are small
- Added different threshold conditions for each of the
  categories.

AMD-Internal: [CPUPL-2755]
Change-Id: Iae4bf96bb7c9bf9f68fd909fb757d7fe13bc6caf
This commit is contained in:
Meghana Vankadari
2023-04-06 16:08:25 +00:00
parent e23765010d
commit 42d05a5aa0
7 changed files with 102 additions and 10 deletions

View File

@@ -90,7 +90,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
// Update the context with architecture specific threshold functions
bli_cntx_set_l3_thresh_funcs
(
2,
3,
// GEMM
BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4,
// GEMMT
BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen,
// SYRK
@@ -216,8 +218,8 @@ void bli_cntx_init_zen4( cntx_t* cntx )
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values. s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 1000, 380, 110 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 1000, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 );
// Initialize the context with the sup thresholds.

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -134,10 +134,12 @@ err_t bli_gemmsup_int
}
}
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
//Enable packing of B matrix for double data type when dims at per
//thread level are above caches and enable packing of A when transA
//(RRC or CRC storage ids) to avoid rd kernels
if(bli_is_double(dt))
if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
{
dim_t m_pt = (m/bli_rntm_ways_for( BLIS_MC, rntm ));
dim_t n_pt = (n/bli_rntm_ways_for( BLIS_NC, rntm ));
@@ -153,7 +155,7 @@ err_t bli_gemmsup_int
}
}
}
#endif
// Using the 1n kernel (B broadcast) gave better performance for sgemm
// in single-thread scenario, given the number of n panels are
// sufficiently larger than m panels.
@@ -210,10 +212,12 @@ err_t bli_gemmsup_int
}
}
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
//Enable packing of B matrix for double data type when dims at per
//thread level are above caches and enable packing of A when transA
//(RRC or CRC storage ids) to avoid rd kernels
if(bli_is_double(dt))
if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
{
dim_t m_pt = (m/bli_rntm_ways_for( BLIS_NC, rntm ));
dim_t n_pt = (n/bli_rntm_ways_for( BLIS_MC, rntm ));
@@ -229,7 +233,7 @@ err_t bli_gemmsup_int
}
}
}
#endif
if ( bli_is_float( dt ) && ( n_threads == 1 ) && ( use_pb == TRUE ) )
{
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,

View File

@@ -3,5 +3,5 @@
add_subdirectory(1)
add_subdirectory(1m)
add_subdirectory(3)
add_subdirectory(aocl_smart)

View File

@@ -0,0 +1,6 @@
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
target_sources("${PROJECT_NAME}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/bli_aocl_smart.c
)

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Decides, for the zen4 configuration, whether a GEMM problem should be
   routed to the SUP path or to the native path.

   a, b, c : operand objects of the operation; the datatype is read from c.
   cntx    : context passed to the storage-preference query and to the
             generic threshold check.

   Returns TRUE when the dimensions fall in the SUP range and FALSE when
   they fall in the native range. Only the double datatype uses the limits
   coded below; every other datatype defers to
   bli_cntx_l3_sup_thresh_is_met().
*/
bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx )
{
    // Datatypes other than double use the generic threshold check.
    if ( bli_obj_dt( c ) != BLIS_DOUBLE )
    {
        return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
    }

    // Upper bounds (exclusive) used by the two SUP categories below.
    const dim_t skinny_limit = 1000;
    const dim_t small_limit  = 5000;

    const dim_t   k       = bli_obj_width_after_trans( a );
    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

    // If the sup kernel dislikes this storage combination, m and n are
    // taken from the width and length of c respectively (i.e. swapped).
    const bool  swap_dims = bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx );
    const dim_t m = swap_dims ? bli_obj_width( c )  : bli_obj_length( c );
    const dim_t n = swap_dims ? bli_obj_length( c ) : bli_obj_width( c );

    // Skinny problems: m or n is below the skinny limit.
    const bool is_skinny = ( m < skinny_limit ) || ( n < skinny_limit );

    // Small problems: m, n and k are all below the small limit.
    const bool is_small = ( m < small_limit ) &&
                          ( n < small_limit ) &&
                          ( k < small_limit );

    if ( is_skinny || is_small )
    {
        return TRUE;
    }

    return FALSE;
}

View File

@@ -197,4 +197,13 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x1 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x4 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x3 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x2 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
// threshold functions
// Chooses between the SUP and native GEMM paths for the zen4 configuration.
// Returns TRUE when the given operands fall in the SUP range and FALSE when
// they fall in the native range.
bool bli_cntx_gemmsup_thresh_is_met_zen4
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
);