Added AVX512 DTRSM small RLNN/RUTN variant kernels

- 8x8 kernels are used for DTRSM SMALL
- Implemented fringe cases with the following block sizes:
   8x8, 8x4, 8x3, 8x2, 8x1
   4x8, 4x4, 4x3, 4x2, 4x1
   3x8, 3x4, 3x3, 3x2, 3x1
   2x8, 2x4, 2x3, 2x2, 2x1
   1x8, 1x4, 1x3, 1x2, 1x1

AMD-Internal: [CPUPL-2745]

Change-Id: Ifb8cfba6958e1c89ddbfa18893127ab6d44cc367
This commit is contained in:
Shubham
2023-01-17 23:04:27 +05:30
parent 1f2447f800
commit b84157fed6
2 changed files with 2309 additions and 117 deletions

View File

@@ -956,9 +956,6 @@ void dtrsm_blis_impl
// Query the architecture ID
arch_t id = bli_arch_query_id();
#if defined(BLIS_KERNELS_ZEN4)
bool uplo, transa;
#endif
switch(id)
{
case BLIS_ARCH_ZEN4:
@@ -969,11 +966,8 @@ void dtrsm_blis_impl
// for n < 200 avx2 kernels are performing better, but if
// n is a multiple of 8 then there will be no fringe case for avx512,
// in such cases avx512 kernels will perform better.
uplo = bli_obj_is_upper(&ao);
transa = bli_obj_has_trans(&ao);
if(( ((blis_side == BLIS_RIGHT) && (uplo == true) && (transa == false)) ||
((blis_side == BLIS_RIGHT) && (uplo == false) && (transa == true))) &&
((n0 > 400) && (m0 > 50)))
if( (blis_side == BLIS_RIGHT) &&
((n0 > 300) && (m0 > 50)))
{
status = bli_trsm_small_AVX512(
blis_side,

File diff suppressed because it is too large Load Diff