Added DTRSM small RUNN/RLTN variant AVX512 kernels

- 8x8 kernels are used for DTRSM SMALL
- Matrix A(a10) is packed for GEMM operations.
- Packed matrix A will be re-used in all the col-blocks
  along the N-dimension.
- Diagonal elements of the A matrix (a11) are packed for
  TRSM operations.
- Implemented fringe cases with following block sizes
   8x8, 8x4, 8x3, 8x2, 8x1
   4x8, 4x4, 4x3, 4x2, 4x1
   3x8, 3x4, 3x3, 3x2, 3x1
   2x8, 2x4, 2x3, 2x2, 2x1
   1x8, 1x4, 1x3, 1x2, 1x1

AMD-Internal: [CPUPL-2745]

Change-Id: I6a174e7f88a4c2c5778052525879552a1e82f6ad
This commit is contained in:
Shubham
2023-01-17 22:59:35 +05:30
committed by Shubham Sharma
parent 63ee4c5e4c
commit 18569b42ee
6 changed files with 4430 additions and 10 deletions

View File

@@ -342,6 +342,7 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_dgemm_skx_asm_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_trsm_small_AVX512.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019-2023, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -54,3 +54,29 @@ void PASTEMAC(ch,opname) \
cntx_t* restrict cntx \
);
/*
 * TRSMSUP_PROT( opname )
 *
 * Declares the prototype of an object-based small-TRSM entry point whose
 * name is produced by PASTEMAC0(opname). The declared function returns an
 * err_t and takes the side, the alpha/a/b objects, a context, a control
 * tree and a flag telling whether the call is made in a parallel setting.
 *
 * NOTE(review): PASTEMAC0 is defined elsewhere; presumably it prepends the
 * "bli_" prefix to opname -- confirm against its definition.
 */
#define TRSMSUP_PROT( opname ) \
\
err_t PASTEMAC0(opname) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx, \
cntl_t* cntl, \
bool is_parallel \
);
/*
 * TRSMSUP_KER_PROT( ch, opname )
 *
 * Declares the prototype of a small-TRSM kernel whose name is produced by
 * PASTEMAC(ch,opname). The declared function returns an err_t and takes
 * the alpha/a/b objects, a context and a control tree; unlike
 * TRSMSUP_PROT it has no side or is_parallel parameter.
 *
 * NOTE(review): PASTEMAC is defined elsewhere; presumably `ch` is the
 * datatype character spliced into the generated name -- confirm.
 * NOTE(review): the prototype is qualified with BLIS_INLINE; confirm that
 * a matching definition is visible in every translation unit that
 * includes this declaration.
 */
#define TRSMSUP_KER_PROT( ch, opname ) \
\
BLIS_INLINE err_t PASTEMAC(ch,opname) \
( \
obj_t* AlphaObj, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx, \
cntl_t* cntl \
);

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -953,14 +953,66 @@ void dtrsm_blis_impl
(is_parallel && (m0+n0)<320))
{
err_t status;
status = bli_trsm_small(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL,
is_parallel);
// Query the architecture ID
arch_t id = bli_arch_query_id();
#if defined(BLIS_KERNELS_ZEN4)
bool uplo, transa;
#endif
switch(id)
{
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
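// Check if the variant is RUN[N/U] or RLT[N/U].
// This is a temporary fix and will be removed when all variants are added.
// For n < 200 the AVX2 kernels perform better, but if n is a multiple
// of 8 there is no fringe case for AVX512, and in such cases the
// AVX512 kernels perform better.
// NOTE(review): the condition below actually requires n0 > 400 and
// m0 > 50 and does not test for multiples of 8 -- confirm which
// thresholds are intended.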
uplo = bli_obj_is_upper(&ao);
transa = bli_obj_has_trans(&ao);
if(( ((blis_side == BLIS_RIGHT) && (uplo == true) && (transa == false)) ||
((blis_side == BLIS_RIGHT) && (uplo == false) && (transa == true))) &&
((n0 > 400) && (m0 > 50)))
{
status = bli_trsm_small_AVX512(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL,
is_parallel);
}
else
{
status = bli_trsm_small(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL,
is_parallel);
}
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
status = bli_trsm_small(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL,
is_parallel);
break;
default:
status = BLIS_NOT_YET_IMPLEMENTED;
}
if (status == BLIS_SUCCESS)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);

View File

@@ -5,4 +5,5 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen_16x14.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen_16x14.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_32x6.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
)

File diff suppressed because it is too large Load Diff

View File

@@ -67,3 +67,10 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_8x32m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x32m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x32m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x32m )
TRSMSUP_PROT(trsm_small_AVX512)
TRSMSUP_KER_PROT( d, trsm_small_AutXB_AlXB_AVX512 )
TRSMSUP_KER_PROT( d, trsm_small_XAltB_XAuB_AVX512 )
TRSMSUP_KER_PROT( d, trsm_small_XAutB_XAlB_AVX512 )
TRSMSUP_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )