mirror of
https://github.com/amd/blis.git
synced 2026-05-13 18:52:14 +00:00
Added DTRSM Small Path AVX512 based LUNN/LLTN Variant Kernels
- 8x8 kernels are used for DTRSM SMALL.
- Matrix A (a10) is packed for GEMM operations.
- The packed matrix A is re-used in all the column blocks along the N dimension.
- Diagonal elements of the A matrix are packed (a11) for TRSM operations.
- Implemented fringe cases with the block sizes below:
  8x8, 8x4, 8x3, 8x2, 8x1
  4x8, 4x4, 4x3, 4x2, 4x1
  3x8, 3x4, 3x3, 3x2, 3x1
  2x8, 2x4, 2x3, 2x2, 2x1
  1x8, 1x4, 1x3, 1x2, 1x1

AMD-Internal: [CPUPL-2745]
Change-Id: I5bb57501f6d3783eb654e375d63901467dd14734
This commit is contained in:
@@ -1059,9 +1059,9 @@ void dtrsm_blis_impl
|
||||
{
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// this is a temporary fix, will be removed when all variants are added
|
||||
if( ((blis_side == BLIS_RIGHT) && ((n0 > 300) && (m0 > 50))) ||
|
||||
((blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_NO_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_TRANSPOSE) ) ) && ((n0 != 30 && n0 !=60 ) && (m0 > 50))) )
|
||||
/* For sizes where m and n < 50, AVX2 kernels perform better,
|
||||
except for sizes where n is a multiple of 8. */
|
||||
if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
}
|
||||
@@ -1088,14 +1088,7 @@ void dtrsm_blis_impl
|
||||
{
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
if ( (blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_NO_TRANSPOSE) ) ))
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt;
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_AVX512;
|
||||
}
|
||||
ker_ft = bli_trsm_small_mt_AVX512;
|
||||
break;
|
||||
#endif// BLIS_KERNELS_ZEN4
|
||||
case BLIS_ARCH_ZEN:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user