AVX-512 based col-preferred kernels for ZGEMM in native path

- Kernel block size is 12x4
- Updated the zen4 config to enable these kernels in zen4 path.
- Tuned MC,KC,NC for better performance for m/n/k size > 500
- Updated CMakeLists.txt with ZGEMM kernels for the Windows build.

Kernel supports:
1. Preload and prebroadcast of A and B
2. Prefetch of C matrix
3. K loop is subdivided into multiple loops to maintain distance between C prefetches.
4. Special-case handling when the imaginary component of alpha/beta is zero
5. Row/Col/General stride of Matrix C

AMD-Internal: [CPUPL-2998]
Change-Id: I62e3c352d475b1add3f43270805fbcee00e2e440
This commit is contained in:
Mangala V
2023-01-03 03:21:23 +05:30
committed by Mangala V
parent 04e091fdca
commit 245fdf072c
5 changed files with 1093 additions and 7 deletions

View File

@@ -345,6 +345,7 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_dgemm_skx_asm_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_trsm_small_AVX512.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.h PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512)

View File

@@ -41,12 +41,12 @@
#define BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs) \
/* s d c z */ \
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 3 ); \
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 12 ); \
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 6, 8, 4 ); \
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 128, 144, 18 ); \
bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 566, \
480, 320, 256, 566 ); \
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 4002, 4080, 256 ); \
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 128, 144, 60 ); \
bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 512, \
480, 320, 256, 160 ); \
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 4002, 4080, 2004 ); \
\
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); \
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \
@@ -70,9 +70,10 @@ void bli_cntx_init_zen4( cntx_t* cntx )
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_32x6, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
/*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE,
// Different GEMM kernels are used for TRSM for zen4 architecture
// Different GEMM kernels are used for TRSM for zen4 architecture
BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,

View File

@@ -9,6 +9,7 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_32x6.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_8x24.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_12x4.c
)
add_subdirectory(sup)

File diff suppressed because it is too large Load Diff

View File

@@ -55,6 +55,7 @@ PACKM_KER_PROT( dcomplex, z, packm_zen4_asm_4xk )
// native dgemm kernel
GEMM_UKR_PROT( double, d, gemm_zen4_asm_32x6 )
GEMM_UKR_PROT( double, d, gemm_zen4_asm_8x24 )
GEMM_UKR_PROT( dcomplex, z, gemm_zen4_asm_12x4 )
//sgemm rv sup
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x64m_avx512 )