mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
DGEMM: Added decision logic to choose between the SUP and native paths for the zen4 architecture
Details:
- Added a new function that chooses between the SUP and native implementations for a given problem size.
- A pointer to this function is stored in the cntx for the zen4 config.
- Divided all size combinations into 3 categories:
  - One dimension is small
  - Two dimensions are small
  - All dimensions are small
- Added different threshold conditions for each of the categories.
AMD-Internal: [CPUPL-2755]
Change-Id: Iae4bf96bb7c9bf9f68fd909fb757d7fe13bc6caf
This commit is contained in:
@@ -90,7 +90,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// Update the context with architecture specific threshold functions
|
||||
bli_cntx_set_l3_thresh_funcs
|
||||
(
|
||||
2,
|
||||
3,
|
||||
// GEMM
|
||||
BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4,
|
||||
// GEMMT
|
||||
BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen,
|
||||
// SYRK
|
||||
@@ -216,8 +218,8 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values. s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 1000, 380, 110 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 1000, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -134,10 +134,12 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//(RRC or CRC storage ids) to avoid rd kernels
|
||||
if(bli_is_double(dt))
|
||||
if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
|
||||
{
|
||||
dim_t m_pt = (m/bli_rntm_ways_for( BLIS_MC, rntm ));
|
||||
dim_t n_pt = (n/bli_rntm_ways_for( BLIS_NC, rntm ));
|
||||
@@ -153,7 +155,7 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
// Using the 1n kernel (B broadcast) gave better performance for sgemm
|
||||
// in single-thread scenario, given the number of n panels are
|
||||
// sufficiently larger than m panels.
|
||||
@@ -210,10 +212,12 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//(RRC or CRC storage ids) to avoid rd kernels
|
||||
if(bli_is_double(dt))
|
||||
if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
|
||||
{
|
||||
dim_t m_pt = (m/bli_rntm_ways_for( BLIS_NC, rntm ));
|
||||
dim_t n_pt = (n/bli_rntm_ways_for( BLIS_MC, rntm ));
|
||||
@@ -229,7 +233,7 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
if ( bli_is_float( dt ) && ( n_threads == 1 ) && ( use_pb == TRUE ) )
|
||||
{
|
||||
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
|
||||
@@ -3,5 +3,5 @@
|
||||
add_subdirectory(1)
|
||||
add_subdirectory(1m)
|
||||
add_subdirectory(3)
|
||||
|
||||
add_subdirectory(aocl_smart)
|
||||
|
||||
|
||||
6
kernels/zen4/aocl_smart/CMakeLists.txt
Normal file
6
kernels/zen4/aocl_smart/CMakeLists.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
# Add bli_aocl_smart.c from this directory to the ${PROJECT_NAME} target.
target_sources(
    "${PROJECT_NAME}"
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/bli_aocl_smart.c
)
|
||||
71
kernels/zen4/aocl_smart/bli_aocl_smart.c
Normal file
71
kernels/zen4/aocl_smart/bli_aocl_smart.c
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/* This function determines if we need to take SUP or native path
|
||||
for given matrix sizes for zen4 configuration.
|
||||
* Returns TRUE if the dimensions fall under SUP range
|
||||
* Returns FALSE if the dimensions fall under Native range
|
||||
*/
|
||||
bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx )
|
||||
{
|
||||
num_t dt = bli_obj_dt( c );
|
||||
|
||||
if( dt == BLIS_DOUBLE )
|
||||
{
|
||||
dim_t k = bli_obj_width_after_trans( a );
|
||||
dim_t m, n;
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
|
||||
{
|
||||
m = bli_obj_width(c);
|
||||
n = bli_obj_length(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
m = bli_obj_length( c );
|
||||
n = bli_obj_width( c );
|
||||
}
|
||||
// For skinny sizes where one/two dimensions are small
|
||||
if((m < 1000) || (n < 1000)) return TRUE;
|
||||
// For all combinations in small sizes
|
||||
if((m < 5000) && (n < 5000) && (k < 5000)) return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
else
|
||||
return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
|
||||
}
|
||||
@@ -197,4 +197,13 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x1 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
|
||||
|
||||
// threshold functions
|
||||
bool bli_cntx_gemmsup_thresh_is_met_zen4
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user