mirror of
https://github.com/amd/blis.git
synced 2026-05-12 18:15:37 +00:00
Implemented AVX-512 based 12x4 m-variant SUP kernels for ZGEMM
- Implemented 12x4m column-preferential SUP kernels (main and fringe
cases). The main kernel dimension is 12x4, and the associated fringe
kernel dimensions are: 12x3m, 12x2m, 12x1m,
8x4, 8x3, 8x2, 8x1
4x4, 4x3, 4x2, 4x1
2x4, 2x3, 2x2, 2x1.
- Included in-register transposition support for C, thus extending
the storage scheme support to CCC, CCR, RCC and RCR inside the
milli-kernel.
- Integrated conditional packing of A into the SUP front end for the
dcomplex datatype. This redirects the RRC and CRC storage schemes
to the preceding set of SUP kernels through storage scheme
transformation (to RCC and CCC, respectively).
- Updated the zen4 context file so that the new set of SUP kernels is
enabled appropriately. Furthermore, the context file was updated
with the AVX-2 dotxv signatures for the dcomplex datatype. This redirects
the fringe cases of type 1x? to the pre-existing AVX-2 GEMV routines.
- Added prefetching of C into the L2 cache, and an unroll factor of 4 for
the k loop in all the kernels.
- Work is in progress to include conjugate support and input spectrum
extension for the AVX-512 SUP kernels. The current thresholds in the zen4
context are the same as the zen3 thresholds for ZGEMM SUP.
AMD-Internal: [CPUPL-3122]
Change-Id: If40bc4409c6eb188765329508cf1f24c0eb12d1e
This commit is contained in:
committed by
Vignesh Balasubramanian
parent
75c5fd1b66
commit
775ce1f13c
@@ -334,7 +334,7 @@ elseif(${ENABLE_SIMD_FLAGS} MATCHES "SSE2")
|
||||
add_definitions(/arch:SSE2)
|
||||
endif()
|
||||
|
||||
if(${TARGET_ARCH} STREQUAL zen4 OR
|
||||
if(${TARGET_ARCH} STREQUAL zen4 OR
|
||||
${TARGET_ARCH} STREQUAL amdzen)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_amaxv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
@@ -355,6 +355,7 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
# The file name must match the source registered in kernels/zen4/3/sup/CMakeLists.txt
# (bli_gemmsup_cv_zen4_z12x4m.c); otherwise /arch:AVX512 is set on a path that is not
# the ZGEMM 12x4m SUP kernel source and never reaches the real file.
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
|
||||
@@ -138,7 +138,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
27,
|
||||
28,
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
@@ -165,6 +165,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
@@ -193,7 +194,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
//
|
||||
// These are reference block sizes and may be overridden based on
|
||||
// number of threads used at runtime.
|
||||
|
||||
|
||||
BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
@@ -267,24 +268,24 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
||||
6, 9, 3, 3 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 12,
|
||||
6, 9, 3, 12 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 72, 72, 36 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 72, 72, 48 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8064, 4080, 2040, 1020 );
|
||||
|
||||
@@ -306,11 +307,11 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
* Override the block sizes in the context to the block sizes used
|
||||
* by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
|
||||
* GEMM kernels are AVX512 based and uses different block sizes.
|
||||
*
|
||||
*
|
||||
* This function should be called in TRSM path before performing
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* bli_zen4_restore_default_blkszs() before exiting TRSM Path
|
||||
*/
|
||||
void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
|
||||
@@ -353,7 +354,7 @@ void bli_zen4_override_gemm_blkszs (cntx_t* cntx)
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 480, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 144, 72, 36 );
|
||||
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
@@ -387,16 +388,16 @@ void bli_zen4_override_gemm_blkszs (cntx_t* cntx)
|
||||
*
|
||||
* This function should be called to restore the block sizes to their
|
||||
* default values if they were overridden by calling
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* TRSM path.
|
||||
*
|
||||
*
|
||||
*/
|
||||
void bli_zen4_restore_default_blkszs (cntx_t* cntx)
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
|
||||
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
|
||||
@@ -64,11 +64,11 @@
|
||||
* Override the block sizes in the context to the block sizes used
|
||||
* by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
|
||||
* GEMM kernels are AVX512 based and uses different block sizes.
|
||||
*
|
||||
*
|
||||
* This function should be called in TRSM path before performing
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* bli_zen4_restore_default_blkszs() before exiting TRSM Path
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
|
||||
@@ -80,9 +80,9 @@ BLIS_EXPORT_BLIS void bli_zen4_override_gemm_blkszs (cntx_t* cntx);
|
||||
*
|
||||
* This function should be called to restore the block sizes to their
|
||||
* default values if they were overridden by calling
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* TRSM path.
|
||||
*
|
||||
*
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
|
||||
|
||||
|
||||
@@ -116,18 +116,30 @@ err_t bli_gemmsup
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
|
||||
|
||||
if((bli_arch_query_id() == BLIS_ARCH_ZEN4) && (bli_obj_dt(a) == BLIS_DOUBLE))
|
||||
if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
|
||||
{
|
||||
// override the existing blocksizes with 24x8 specific ones.
|
||||
// This can be removed when we use same blocksizes and function pointers
|
||||
// for all level-3 SUP routines.
|
||||
bli_zen4_override_gemm_blkszs(&cntx_gemm);
|
||||
|
||||
// Pack A to avoid RD kernels.
|
||||
if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
|
||||
if( bli_obj_dt(a) == BLIS_DOUBLE )
|
||||
{
|
||||
bli_rntm_set_pack_a(1, rntm);//packa
|
||||
// override the existing blocksizes with 24x8 specific ones.
|
||||
// This can be removed when we use same blocksizes and function pointers
|
||||
// for all level-3 SUP routines.
|
||||
bli_zen4_override_gemm_blkszs(&cntx_gemm);
|
||||
|
||||
// Pack A to avoid RD kernels.
|
||||
if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
|
||||
{
|
||||
bli_rntm_set_pack_a(1, rntm);//packa
|
||||
}
|
||||
}
|
||||
else if( bli_obj_dt(a) == BLIS_DCOMPLEX )
|
||||
{
|
||||
// Pack A to avoid RD kernels.
|
||||
if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
|
||||
{
|
||||
bli_rntm_set_pack_a(1, rntm);//packa
|
||||
}
|
||||
}
|
||||
else ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -66,7 +66,6 @@ err_t bli_gemmsup_int
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
const bool col_pref = !row_pref;
|
||||
@@ -95,7 +94,7 @@ err_t bli_gemmsup_int
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
// calculate number of micropanels in m and n dimensions and
|
||||
// recalculate the automatic thread factorization based on these number of micropanels
|
||||
// recalculate the automatic thread factorization based on these number of micropanels
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
@@ -129,11 +128,14 @@ err_t bli_gemmsup_int
|
||||
if (bli_is_dcomplex(dt) && (n_threads == 1))
|
||||
{
|
||||
if ((m > 55) && (k > 55) && (n > 55))
|
||||
bli_rntm_set_pack_b(1, rntm);//packb
|
||||
{
|
||||
if ( row_pref )
|
||||
bli_rntm_set_pack_b(1, rntm);//packb
|
||||
}
|
||||
}
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//(RRC or CRC storage ids) to avoid rd kernels
|
||||
if(bli_is_double(dt))
|
||||
{
|
||||
@@ -146,7 +148,7 @@ err_t bli_gemmsup_int
|
||||
{
|
||||
bli_rntm_set_pack_b(1, rntm);//packb
|
||||
|
||||
if(stor_id==BLIS_RRC || stor_id==BLIS_CRC)
|
||||
if(( stor_id==BLIS_RRC ) || ( stor_id==BLIS_CRC ))
|
||||
bli_rntm_set_pack_a(1, rntm);//packa
|
||||
}
|
||||
}
|
||||
@@ -191,22 +193,25 @@ err_t bli_gemmsup_int
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
/* Enable packing for B matrix for higher sizes. Note that pack A
|
||||
/* Enable packing for B matrix for higher sizes. Note that pack A
|
||||
* becomes pack B inside var2m because this is transpose case*/
|
||||
if(bli_is_float(dt) && (n_threads==1)) {
|
||||
if((m > 240) && (k > 240) && (n > 240))
|
||||
bli_rntm_set_pack_a( 1, rntm );//packb
|
||||
}
|
||||
|
||||
/*Enable packing of A matrix for complex data type*/
|
||||
//Enable packing of A matrix for complex data type
|
||||
if (bli_is_dcomplex(dt) && (n_threads == 1))
|
||||
{
|
||||
if ((m > 55) && (k > 55) && (n > 55))
|
||||
bli_rntm_set_pack_a(1, rntm);//packb
|
||||
{
|
||||
if ( row_pref )
|
||||
bli_rntm_set_pack_a(1, rntm);//packb
|
||||
}
|
||||
}
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
//(RRC or CRC storage ids) to avoid rd kernels
|
||||
if(bli_is_double(dt))
|
||||
{
|
||||
@@ -215,16 +220,16 @@ err_t bli_gemmsup_int
|
||||
|
||||
if(k > 120)
|
||||
{
|
||||
if(((m_pt > 320) && (n_pt > 120)) || ((m_pt > 120) && (n_pt > 320)))
|
||||
if(((m_pt > 320) && (n_pt > 120)) || ((m_pt > 120) && (n_pt > 320)))
|
||||
{
|
||||
bli_rntm_set_pack_a(1, rntm);//packb
|
||||
|
||||
if(stor_id==BLIS_RRC || stor_id==BLIS_CRC)
|
||||
if(( stor_id==BLIS_RRC ) || ( stor_id==BLIS_CRC ))
|
||||
bli_rntm_set_pack_b(1, rntm);//packa
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if ( bli_is_float( dt ) && ( n_threads == 1 ) && ( use_pb == TRUE ) )
|
||||
{
|
||||
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
|
||||
@@ -198,7 +198,7 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( dt == BLIS_DOUBLE )
|
||||
if ( ( dt == BLIS_DOUBLE ) || ( dt == BLIS_DCOMPLEX ) )
|
||||
{
|
||||
// The optimizations are only done for CRC and RRC storage schemes to avoid RD kernels.
|
||||
// Optimizations for other storage schemes is yet to be done.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen_16x14.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen_16x14.c
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.h
|
||||
@@ -11,6 +11,7 @@ target_sources("${PROJECT_NAME}"
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_cv_zen4_z12x4m.c
|
||||
)
|
||||
|
||||
add_subdirectory(d24x8)
|
||||
|
||||
5688
kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c
Normal file
5688
kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -166,3 +166,23 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
|
||||
|
||||
// Zgemm sup CV kernels
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x4m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x3m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x2m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x1m )
|
||||
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x1 )
|
||||
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x1 )
|
||||
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
|
||||
Reference in New Issue
Block a user