Added DTRSM Small Path AVX512 based LUNN/LLTN Variant Kernels

- 8x8 kernels are used for DTRSM SMALL
- Matrix A(a10) is packed for GEMM operations.
- Packed matrix A is reused across all column blocks
  along the N dimension.
- Diagonal elements of the A matrix (a11) are packed for
  TRSM operations.
- Implemented fringe cases with the following block sizes:
   8x8, 8x4, 8x3, 8x2, 8x1
   4x8, 4x4, 4x3, 4x2, 4x1
   3x8, 3x4, 3x3, 3x2, 3x1
   2x8, 2x4, 2x3, 2x2, 2x1
   1x8, 1x4, 1x3, 1x2, 1x1

AMD-Internal: [CPUPL-2745]

Change-Id: I5bb57501f6d3783eb654e375d63901467dd14734
This commit is contained in:
Aayush Kumar
2023-04-12 04:38:15 +00:00
parent b515643c54
commit 6ad387c2aa
2 changed files with 1794 additions and 12 deletions

View File

@@ -1059,9 +1059,9 @@ void dtrsm_blis_impl
{
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// this is a temporary fix, will be removed when all variants are added
if( ((blis_side == BLIS_RIGHT) && ((n0 > 300) && (m0 > 50))) ||
((blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_NO_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_TRANSPOSE) ) ) && ((n0 != 30 && n0 !=60 ) && (m0 > 50))) )
/* For sizes where m and n are below 50, the AVX2 kernels perform better,
except for sizes where n is a multiple of 8. */
if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
{
ker_ft = bli_trsm_small_AVX512;
}
@@ -1088,14 +1088,7 @@ void dtrsm_blis_impl
{
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
if ( (blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_NO_TRANSPOSE) ) ))
{
ker_ft = bli_trsm_small_mt;
}
else
{
ker_ft = bli_trsm_small_mt_AVX512;
}
ker_ft = bli_trsm_small_mt_AVX512;
break;
#endif// BLIS_KERNELS_ZEN4
case BLIS_ARCH_ZEN:

File diff suppressed because it is too large Load Diff