Added AVX-512 based col-preferred kernels for DGEMM with optional pack framework

- Main kernel is of size 24x8 and the associated fringe kernels
  added are
  - 24x7m, 24x6m, 24x5m, 24x4m, 24x3m, 24x2m, 24x1m
  - 24x8,  24x7,  24x6,  24x5,  24x4,  24x3,  24x2,  24x1
  - 16x8,  16x7,  16x6,  16x5,  16x4,  16x3,  16x2,  16x1
  -  8x8,   8x7,   8x6,   8x5,   8x4,   8x3,   8x2,   8x1

- For fringe kernels, 24x? kernel handles 16 < m_remainder <  24
                      16x? kernel handles 8  < m_remainder <= 16
                       8x? kernel handles 0  < m_remainder <= 8
- Added a function 'bli_zen4_override_gemm_blkszs' to override
  blocksizes and kernels to be used for SUP for supported storage
  schemes.
- Updated the zen4 config to enable these kernels in zen4 path.
- Thresholds are yet to be derived.
- Updated CMakeLists.txt with DGEMM SUP kernels for windows build.

Kernel-specific details:
- K-loop is unrolled 8 times to facilitate prefetch of B.
- For every load of one column of A, the corresponding column in
  next panel of A is prefetched with T1 hint.
- One column of C is prefetched with T0 hint per iteration of LOOP2.
- TAIL_NITER is derived to be 3.
- For every unroll of k-loop, one row of B is prefetched with T0 hint.
- C-prefetching for row-storage is yet to be added.
- B-prefetching for col-storage is yet to be added.
- Support for C transpose is yet to be added.

AMD-Internal: [CPUPL-2755], [CPUPL-2409]
Change-Id: Ie240c893469032dc2271cbfe00cceccfe6c4ea48
This commit is contained in:
Meghana Vankadari
2023-01-02 06:24:04 +00:00
parent 253ceffb0f
commit 31a4203c32
19 changed files with 28089 additions and 15 deletions

View File

@@ -354,6 +354,15 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
# DGEMM SUP kernel sources (the 24x8m file plus the d24x8 Mx1..Mx8 files)
# are compiled with /arch:AVX512, set here in a single call.
set_source_files_properties(
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
    PROPERTIES COMPILE_FLAGS /arch:AVX512
)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")

View File

@@ -240,7 +240,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
bli_cntx_set_l3_sup_kers
(
30,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
// 6x8 kernels will still be used for gemmt/syrk sup
// In case of gemm, a special function will be used to override
// these blocksizes and functions with 24x8-specific ones.
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
@@ -334,6 +336,50 @@ void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
);
}
// gemmt/syrk SUP still require 6x8 block sizes, so the context keeps those
// by default. This function overrides the SUP blocksizes and the double
// SUP kernel entries in the given context with the AVX-512
// (bli_dgemmsup_rv_zen4_asm_24x8m) ones, for use by DGEMM only.
// It should be removed once checks are added around the 6x8-specific
// gemmt code.
//
// cntx: context whose SUP blocksizes and SUP kernel table are overwritten.
void bli_zen4_override_gemm_blkszs (cntx_t* cntx)
{
    // One local per blocksize that is overridden.
    // NOTE(review): the second value of each initializer is presumably the
    // double-precision entry (24 and 8 match the 24x8 kernel) -- confirm
    // against bli_blksz_init / bli_blksz_init_easy.
    blksz_t mr_sup;
    blksz_t nr_sup;
    blksz_t kc_sup;
    blksz_t mc_sup;

    bli_blksz_init_easy( &mc_sup, 144, 144, 72, 36 );
    bli_blksz_init_easy( &kc_sup, 512, 480, 128, 64 );
    bli_blksz_init_easy( &nr_sup, 16, 8, 8, 4 );
    bli_blksz_init     ( &mr_sup, 6, 24, 3, 3,
                                  9, 9, 3, 3 );

    // Install the four SUP blocksizes into the context.
    bli_cntx_set_l3_sup_blkszs
    (
      4,
      // level-3
      BLIS_KC, &kc_sup,
      BLIS_MC, &mc_sup,
      BLIS_NR, &nr_sup,
      BLIS_MR, &mr_sup,
      cntx
    );

    // Point all eight storage combinations at the 24x8m kernel for doubles.
    // NOTE(review): the trailing FALSE presumably marks the kernel as
    // column-preferring (the 6x8 entries pass TRUE) -- confirm against
    // bli_cntx_set_l3_sup_kers.
    bli_cntx_set_l3_sup_kers
    (
      8,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
      cntx
    );
}
/*
* Restore the block sizes to default values needed for zen4 context.
*

View File

@@ -73,6 +73,8 @@
*/
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
BLIS_EXPORT_BLIS void bli_zen4_override_gemm_blkszs (cntx_t* cntx);
/*
* Restore the block sizes to default values needed for zen4 context.
*

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -73,6 +73,7 @@ err_t bli_gemmsup
trans_t transa = bli_obj_conjtrans_status( a );
trans_t transb = bli_obj_conjtrans_status( b );
//Don't use sup for currently unsupported storage types in cgemmsup
if(bli_obj_is_scomplex(c) &&
(((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
@@ -95,17 +96,49 @@ err_t bli_gemmsup
}
// Obtain a valid (native) context from the gks if necessary.
// Obtain a valid context from the gks if necessary.
// NOTE: This must be done before calling the _check() function, since
// that function assumes the context pointer is valid.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
// Creating a local copy of cntx in order to store overridden blocksizes
// and kernel function pointers.
// This can be removed when we use the same blocksizes and function pointers
// for all level-3 SUP routines.
cntx_t cntx_gemm = *cntx;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_l;
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
if((bli_arch_query_id() == BLIS_ARCH_ZEN4) && (bli_obj_dt(a) == BLIS_DOUBLE))
{
// This check will be removed once transpose and store of C matrix inside
// the kernel is supported.
if((stor_id == BLIS_RCC || stor_id == BLIS_CRR || stor_id == BLIS_RRC))
{
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for dgemm.");
return BLIS_FAILURE;
}
// override the existing blocksizes with 24x8 specific ones.
// This can be removed when we use same blocksizes and function pointers
// for all level-3 SUP routines.
bli_zen4_override_gemm_blkszs(&cntx_gemm);
// Pack A to avoid RD kernels.
if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
{
bli_rntm_set_pack_a(1, rntm);//packa
}
}
#endif
#ifdef AOCL_DYNAMIC
// Calculating optimal nt and corresponding factorization (ic,jc) here, so
// as to determine the matrix dimensions (A - m, B - n) per thread. This
@@ -158,7 +191,7 @@ err_t bli_gemmsup
b,
beta,
c,
cntx,
&cntx_gemm,
rntm
);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -66,6 +66,16 @@ err_t bli_gemmsup_int
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const bool col_pref = !row_pref;
// For row-preferred kernels, rrr_rrc_rcr_crr becomes primary
// For col-preferred kernels, rcc_crc_ccr_ccc becomes primary
const bool is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) ||
( col_pref && is_rcc_crc_ccr_ccc );
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
@@ -78,12 +88,11 @@ err_t bli_gemmsup_int
return BLIS_FAILURE;
}
if ( is_rrr_rrc_rcr_crr )
if ( is_primary )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
// - Currently only row-preferential kernels are only supported.
// calculate number of micropanels in m and n dimensions and
// recalculate the automatic thread factorization based on these number of micropanels
@@ -164,7 +173,6 @@ err_t bli_gemmsup_int
// This branch handles:
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
// - Currently only row-preferential kernels are only supported.
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -198,9 +198,28 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
}
else
{
//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
bli_abort();
if ( dt == BLIS_DOUBLE )
{
// The optimizations are only done for CRC and RRC storage schemes to avoid RD kernels.
// Optimizations for other storage schemes are yet to be done.
if ( packa )
{
if( *eff_id == BLIS_CRC )
{
*eff_id = BLIS_CCC;
}
else if ( *eff_id == BLIS_RRC )
{
*trans = bli_trans_toggled( *trans );
*eff_id = BLIS_RCC;
}
}
}
else
{
printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels for S, C and Z datatypes.\n" );
bli_abort();
}
}
}

View File

@@ -11,4 +11,4 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
)
add_subdirectory(sup)
add_subdirectory(sup)

View File

@@ -10,4 +10,7 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.h
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c
)
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c
)
add_subdirectory(d24x8)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
# Add the eight bli_dgemmsup_rv_zen4_asm_Mx<N>.c sources (N = 1..8) from this
# directory to the project target, in ascending order of N.
foreach(kernel_cols RANGE 1 8)
    target_sources("${PROJECT_NAME}"
        PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx${kernel_cols}.c
    )
endforeach()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -118,4 +118,47 @@ TRSMSMALL_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )
#ifdef BLIS_ENABLE_OPENMP
TRSMSMALL_PROT(trsm_small_mt_AVX512)
#endif
#endif
// Prototypes for the double-precision gemmsup RV zen4 kernels,
// grouped by kernel height.

// 24x?m kernels
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1m)

// 24x? kernels
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)

// 16x? kernels
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)

// 8x? kernels
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)