mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Added DTRSM small RUNN/RLTN variant AVX512 kernels
- 8x8 kernels are used for DTRSM SMALL. - Matrix A (a10) is packed for GEMM operations. - The packed matrix A is re-used in all the column blocks along the N-dimension. - Diagonal elements of the A matrix (a11) are packed for TRSM operations. - Implemented fringe cases with the following block sizes: 8x8, 8x4, 8x3, 8x2, 8x1; 4x8, 4x4, 4x3, 4x2, 4x1; 3x8, 3x4, 3x3, 3x2, 3x1; 2x8, 2x4, 2x3, 2x2, 2x1; 1x8, 1x4, 1x3, 1x2, 1x1. AMD-Internal: [CPUPL-2745] Change-Id: I6a174e7f88a4c2c5778052525879552a1e82f6ad
This commit is contained in:
@@ -342,6 +342,7 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_dgemm_skx_asm_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_trsm_small_AVX512.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019-2023, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -54,3 +54,29 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* restrict cntx \
|
||||
);
|
||||
|
||||
|
||||
|
||||
#define TRSMSUP_PROT( opname ) \
|
||||
\
|
||||
err_t PASTEMAC0(opname) \
|
||||
( \
|
||||
side_t side, \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
cntx_t* cntx, \
|
||||
cntl_t* cntl, \
|
||||
bool is_parallel \
|
||||
);
|
||||
|
||||
#define TRSMSUP_KER_PROT( ch, opname ) \
|
||||
\
|
||||
BLIS_INLINE err_t PASTEMAC(ch,opname) \
|
||||
( \
|
||||
obj_t* AlphaObj, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
cntx_t* cntx, \
|
||||
cntl_t* cntl \
|
||||
);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -953,14 +953,66 @@ void dtrsm_blis_impl
|
||||
(is_parallel && (m0+n0)<320))
|
||||
{
|
||||
err_t status;
|
||||
status = bli_trsm_small(
|
||||
blis_side,
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
NULL,
|
||||
NULL,
|
||||
is_parallel);
|
||||
|
||||
// Query the architecture ID
|
||||
arch_t id = bli_arch_query_id();
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
bool uplo, transa;
|
||||
#endif
|
||||
switch(id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// check if variant is RUN[N/U] or RLT[N/U]
|
||||
// this is a temporary fix, will be removed when all variants are added
|
||||
|
||||
// for n < 200 avx2 kernels are performing better, but if
|
||||
// n is a multiple of 8 then there will be no fringe case for avx512,
|
||||
// in such cases avx512 kernels will perform better.
|
||||
uplo = bli_obj_is_upper(&ao);
|
||||
transa = bli_obj_has_trans(&ao);
|
||||
if(( ((blis_side == BLIS_RIGHT) && (uplo == true) && (transa == false)) ||
|
||||
((blis_side == BLIS_RIGHT) && (uplo == false) && (transa == true))) &&
|
||||
((n0 > 400) && (m0 > 50)))
|
||||
{
|
||||
status = bli_trsm_small_AVX512(
|
||||
blis_side,
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
NULL,
|
||||
NULL,
|
||||
is_parallel);
|
||||
}
|
||||
else
|
||||
{
|
||||
status = bli_trsm_small(
|
||||
blis_side,
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
NULL,
|
||||
NULL,
|
||||
is_parallel);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
status = bli_trsm_small(
|
||||
blis_side,
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
NULL,
|
||||
NULL,
|
||||
is_parallel);
|
||||
break;
|
||||
default:
|
||||
status = BLIS_NOT_YET_IMPLEMENTED;
|
||||
}
|
||||
if (status == BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
|
||||
|
||||
@@ -5,4 +5,5 @@ target_sources("${PROJECT_NAME}"
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen_16x14.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen_16x14.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_32x6.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
|
||||
)
|
||||
|
||||
4333
kernels/zen4/3/bli_trsm_small_AVX512.c
Normal file
4333
kernels/zen4/3/bli_trsm_small_AVX512.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -67,3 +67,10 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_8x32m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x32m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x32m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x32m )
|
||||
|
||||
|
||||
TRSMSUP_PROT(trsm_small_AVX512)
|
||||
TRSMSUP_KER_PROT( d, trsm_small_AutXB_AlXB_AVX512 )
|
||||
TRSMSUP_KER_PROT( d, trsm_small_XAltB_XAuB_AVX512 )
|
||||
TRSMSUP_KER_PROT( d, trsm_small_XAutB_XAlB_AVX512 )
|
||||
TRSMSUP_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )
|
||||
|
||||
Reference in New Issue
Block a user