mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added AVX-512 based col-preferred kernels for DGEMM with optional pack framework
- Main kernel is of size 24x8 and the associated fringe kernels
added are
- 24x7m, 24x6m, 24x5m, 24x4m, 24x3m, 24x2m, 24x1m
- 24x8, 24x7, 24x6, 24x5, 24x4, 24x3, 24x2, 24x1
- 16x8, 16x7, 16x6, 16x5, 16x4, 16x3, 16x2, 16x1
- 8x8, 8x7, 8x6, 8x5, 8x4, 8x3, 8x2, 8x1
- For fringe kernels, 24x? kernel handles 16 < m_remainder < 24
16x? kernel handles 8 < m_remainder <= 16
8x? kernel handles 0 < m_remainder <= 8
- Added a function 'bli_zen4_override_gemm_blkszs' to override
blocksizes and kernels to be used for SUP for supported storage
schemes.
- Updated the zen4 config to enable these kernels in zen4 path.
- Thresholds are yet to be derived.
- Updated CMakeLists.txt with DGEMM SUP kernels for windows build.
Kernel-specific details:
- K-loop is unrolled by 8 times to facilitate prefetch of B.
- For every load of one column of A, the corresponding column in
next panel of A is prefetched with T1 hint.
- One column of C is prefetched with T0 hint per iteration of LOOP2.
- TAIL_NITER is derived to be 3.
- For every unroll of k-loop, one row of B is prefetched with T0 hint.
- C-prefetching for row-storage is yet to be added.
- B-prefetching for col-storage is yet to be added.
- Support for C transpose is yet to be added.
AMD-Internal: [CPUPL-2755], [CPUPL-2409]
Change-Id: Ie240c893469032dc2271cbfe00cceccfe6c4ea48
This commit is contained in:
@@ -354,6 +354,15 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")
|
||||
|
||||
@@ -240,7 +240,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
30,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
// 6x8 kernels will still be used for gemmt/syrk sup
|
||||
// In case of gemm, a special function will be used to override
|
||||
// these blocksizes and functions with 24x8-specific ones.
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
@@ -334,6 +336,50 @@ void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Since gemmt/syrk SUP requires block sizes to be 6x8,
|
||||
// we use this function to override blocksizes and kernel functions
|
||||
// with AVX-512 ones for DGEMM only.
|
||||
// This function needs to be removed once checks are added around
|
||||
// 6x8-specific gemmt code.
|
||||
// Overrides the level-3 SUP blocksizes (MR, NR, KC, MC) and the
// double-precision SUP kernel table stored in `cntx`, pointing every
// storage-scheme id at bli_dgemmsup_rv_zen4_asm_24x8m.
//
// cntx: context that is modified in place. The bli_gemmsup() caller shown
//       in this change passes a local copy of the queried context.
void bli_zen4_override_gemm_blkszs (cntx_t* cntx)
|
||||
{
|
||||
// Local scratch table; only the MR, NR, KC and MC slots are filled in below.
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
// NOTE(review): the four values are presumably ordered s, d, c, z, and the
// second row (9, 9, 3, 3) presumably gives the corresponding maximum
// sizes -- confirm against bli_blksz_init(). Under that assumption the
// double-precision MR/NR pair is 24/8, matching the 24x8m kernel name.
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 24, 3, 3,
|
||||
9, 9, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 480, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 144, 72, 36 );
|
||||
|
||||
// Update the context with the blocksizes initialized above
|
||||
// for SUP execution (4 = number of blocksize id/pointer pairs that follow).
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
4,
|
||||
// level-3
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Register the same 24x8m kernel for all eight storage-scheme ids
// (8 = number of entries that follow).
// NOTE(review): the trailing FALSE is presumably the "prefers rows" flag,
// i.e. the kernel is registered as column-preferred -- confirm against
// bli_cntx_set_l3_sup_kers().
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
8,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Restore the block sizes to default values needed for zen4 context.
|
||||
*
|
||||
|
||||
@@ -73,6 +73,8 @@
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_zen4_override_gemm_blkszs (cntx_t* cntx);
|
||||
|
||||
/*
|
||||
* Restore the block sizes to default values needed for zen4 context.
|
||||
*
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -73,6 +73,7 @@ err_t bli_gemmsup
|
||||
trans_t transa = bli_obj_conjtrans_status( a );
|
||||
trans_t transb = bli_obj_conjtrans_status( b );
|
||||
|
||||
|
||||
//Don't use sup for currently unsupported storage types in cgemmsup
|
||||
if(bli_obj_is_scomplex(c) &&
|
||||
(((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
|
||||
@@ -95,17 +96,49 @@ err_t bli_gemmsup
|
||||
}
|
||||
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// Obtain a valid context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Creating a local copy of cntx in order to store overridden blocksizes
|
||||
// and kernel function pointers.
|
||||
// This can be removed when we use same blocksizes and function pointers
|
||||
// for all level-3 SUP routines.
|
||||
cntx_t cntx_gemm = *cntx;
|
||||
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
|
||||
|
||||
if((bli_arch_query_id() == BLIS_ARCH_ZEN4) && (bli_obj_dt(a) == BLIS_DOUBLE))
|
||||
{
|
||||
// This check will be removed once transpose and store of C matrix inside
|
||||
// the kernel is supported.
|
||||
if((stor_id == BLIS_RCC || stor_id == BLIS_CRR || stor_id == BLIS_RRC))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for dgemm.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
// override the existing blocksizes with 24x8 specific ones.
|
||||
// This can be removed when we use same blocksizes and function pointers
|
||||
// for all level-3 SUP routines.
|
||||
bli_zen4_override_gemm_blkszs(&cntx_gemm);
|
||||
|
||||
// Pack A to avoid RD kernels.
|
||||
if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
|
||||
{
|
||||
bli_rntm_set_pack_a(1, rntm);//packa
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
// Calculating optimal nt and corresponding factorization (ic,jc) here, so
|
||||
// as to determine the matrix dimensions (A - m, B - n) per thread. This
|
||||
@@ -158,7 +191,7 @@ err_t bli_gemmsup
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
&cntx_gemm,
|
||||
rntm
|
||||
);
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -66,6 +66,16 @@ err_t bli_gemmsup_int
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
const bool col_pref = !row_pref;
|
||||
|
||||
// For row-preferred kernels, rrr_rrc_rcr_crr becomes primary
|
||||
// For col-preferred kernels, rcc_crc_ccr_ccc becomes primary
|
||||
const bool is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) ||
|
||||
( col_pref && is_rcc_crc_ccr_ccc );
|
||||
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
@@ -78,12 +88,11 @@ err_t bli_gemmsup_int
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
if ( is_rrr_rrc_rcr_crr )
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
|
||||
// calculate number of micropanels in m and n dimensions and
|
||||
// recalculate the automatic thread factorization based on these number of micropanels
|
||||
@@ -164,7 +173,6 @@ err_t bli_gemmsup_int
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -198,9 +198,28 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
|
||||
}
|
||||
else
|
||||
{
|
||||
//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
|
||||
bli_abort();
|
||||
if ( dt == BLIS_DOUBLE )
|
||||
{
|
||||
// The optimizations are only done for CRC and RRC storage schemes to avoid RD kernels.
|
||||
// Optimizations for other storage schemes are yet to be done.
|
||||
if ( packa )
|
||||
{
|
||||
if( *eff_id == BLIS_CRC )
|
||||
{
|
||||
*eff_id = BLIS_CCC;
|
||||
}
|
||||
else if ( *eff_id == BLIS_RRC )
|
||||
{
|
||||
*trans = bli_trans_toggled( *trans );
|
||||
*eff_id = BLIS_RCC;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels for S, C and Z datatypes.\n" );
|
||||
bli_abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,4 +11,4 @@ target_sources("${PROJECT_NAME}"
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
|
||||
)
|
||||
|
||||
add_subdirectory(sup)
|
||||
add_subdirectory(sup)
|
||||
|
||||
@@ -10,4 +10,7 @@ target_sources("${PROJECT_NAME}"
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c
|
||||
)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c
|
||||
)
|
||||
|
||||
add_subdirectory(d24x8)
|
||||
|
||||
8845
kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
Normal file
8845
kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
Normal file
File diff suppressed because it is too large
Load Diff
13
kernels/zen4/3/sup/d24x8/CMakeLists.txt
Normal file
13
kernels/zen4/3/sup/d24x8/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
# Add the eight bli_dgemmsup_rv_zen4_asm_Mx<N>.c sources (N = 1..8) from this
# directory to the ${PROJECT_NAME} target. PRIVATE means they are compiled
# into the target but are not propagated to targets that link against it.
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx1.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx2.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx3.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx4.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx5.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx6.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c
|
||||
)
|
||||
1380
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
Normal file
1380
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
Normal file
File diff suppressed because it is too large
Load Diff
1650
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
Normal file
1650
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
Normal file
File diff suppressed because it is too large
Load Diff
1920
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
Normal file
1920
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
Normal file
File diff suppressed because it is too large
Load Diff
2196
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
Normal file
2196
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
Normal file
File diff suppressed because it is too large
Load Diff
2571
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c
Normal file
2571
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c
Normal file
File diff suppressed because it is too large
Load Diff
2841
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
Normal file
2841
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
Normal file
File diff suppressed because it is too large
Load Diff
3112
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
Normal file
3112
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
Normal file
File diff suppressed because it is too large
Load Diff
3383
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
Normal file
3383
kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -118,4 +118,47 @@ TRSMSMALL_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
TRSMSMALL_PROT(trsm_small_mt_AVX512)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Dgemm sup RV kernels
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2m)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1m)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x8)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x7)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x7)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x6)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x6)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x5)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x5)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x4)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x4)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x3)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x3)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x2)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x2)
|
||||
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user