Add AVX2 path for TRSM+GEMM combination.

- Enabled the AVX2 TRSM + GEMM kernel path: when GEMM is called
  from the TRSM context, it invokes the AVX2 GEMM kernels instead
  of the default AVX-512 GEMM kernels.

- The default context has the block sizes for AVX512 GEMM
  kernels, however, TRSM uses AVX2 GEMM kernels and they
  need different block sizes.

- Added new API bli_zen4_override_trsm_blkszs(). It overrides
  default block sizes in context with block sizes needed for
  AVX2 GEMM kernels.

- Added new API bli_zen4_restore_default_blkszs(). It restores
  the block sizes to their default values (as needed by the
  default AVX512 GEMM kernels).

- Updated bli_trsm_front() to override the block sizes in the
  context needed by TRSM + AVX2 GEMM kernels and restore them
  to the default values at the end of this function. It is done
  in bli_trsm_front() so that we override the context before
  creating different threads.

AMD-Internal: [CPUPL-2225]
Change-Id: Ie92d0fc40f94a32dfb865fe3771dc14ed7884c55
This commit is contained in:
Dipal M Zambare
2022-05-18 11:01:41 +05:30
committed by Dipal M. Zambare
parent d4bb906094
commit 2ba2fb2b63
12 changed files with 311 additions and 34 deletions

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -61,5 +61,29 @@
//#define BLIS_ENABLE_FAST_MATH
/*
* Override the block sizes in the context with the block sizes used
* by the AVX2 GEMM+TRSM kernels. This is needed in the Zen4 context
* because the default GEMM kernels are AVX512 based and use different
* block sizes.
*
* This function should be called in the TRSM path before performing
* any packing operations.
*
* The context must be restored to its default values by calling
* bli_zen4_restore_default_blkszs() before exiting the TRSM path.
*/
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
/*
* Restore the block sizes to the default values needed for the zen4 context.
*
* This function should be called to restore the block sizes to their
* default values if they were overridden by calling
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
* TRSM path.
*
*/
BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
#endif

View File

@@ -34,6 +34,24 @@
#include "blis.h"
/*
 * Default block sizes for zen4.
 *
 * Kept as a macro because the same list is needed in more than one place
 * in this file (context initialization and restoring the defaults after a
 * TRSM override).
 *
 * blkszs: array of blksz_t indexed by block size id; the BLIS_MR, BLIS_NR,
 *         BLIS_MC, BLIS_KC, BLIS_NC, BLIS_AF and BLIS_DF entries are written.
 *
 * The body is wrapped in do { } while ( 0 ) so that the macro expands to a
 * single statement and is safe under an unbraced if/else. Invoke it with a
 * trailing semicolon. The last line deliberately carries no line
 * continuation so that nothing following the macro is spliced into it.
 */
#define BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs) \
    do \
    { \
        /*                                            s      d      c      z */ \
        bli_blksz_init_easy( &(blkszs)[ BLIS_MR ],   32,    16,     3,     3 ); \
        bli_blksz_init_easy( &(blkszs)[ BLIS_NR ],   12,    14,     8,     4 ); \
        bli_blksz_init_easy( &(blkszs)[ BLIS_MC ],  480,   240,   144,    18 ); \
        bli_blksz_init     ( &(blkszs)[ BLIS_KC ],  384,   512,   256,   566, \
                                                    480,   320,   256,   566 ); \
        bli_blksz_init_easy( &(blkszs)[ BLIS_NC ], 3072,  4004,  4080,   256 ); \
        \
        bli_blksz_init_easy( &(blkszs)[ BLIS_AF ],    8,     8,    -1,    -1 ); \
        bli_blksz_init_easy( &(blkszs)[ BLIS_DF ],    8,     8,    -1,    -1 ); \
    } while ( 0 )
void bli_cntx_init_zen4( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
@@ -47,20 +65,23 @@ void bli_cntx_init_zen4( cntx_t* cntx )
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
4,
10,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
#if 0 // GENOA TODO: TRSM AVX-512 implementation
BLIS_GEMM_AVX2_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_AVX2_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
#endif
cntx
);
@@ -115,7 +136,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
24,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
// axpbyv
@@ -162,17 +183,8 @@ void bli_cntx_init_zen4( cntx_t* cntx )
//
// These are reference block sizes and may be overridden based on
// number of threads used at runtime.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 512, 256, 566,
480, 320, 256, 566 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 4004, 4080, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
@@ -192,11 +204,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
);
// -------------------------------------------------------------------------
#if 0 // GENOA TODO: TRSM AVX-512 implementation
#if 0 // Replaced with runtime blocksize override
//Initialize TRSM blocksize objects with architecture-specific values.
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
//Tuning is done for double-precision only.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );
@@ -298,3 +313,72 @@ void bli_cntx_init_zen4( cntx_t* cntx )
cntx
);
}
/*
 * Replace the native level-3 block sizes stored in the context with the
 * ones used by the AVX2 GEMM+TRSM kernels. The Zen4 defaults are meant for
 * the AVX512 GEMM kernels, which use different block sizes.
 *
 * Call this in the TRSM path before any packing operation is performed.
 *
 * The caller must undo the override with bli_zen4_restore_default_blkszs()
 * before leaving the TRSM path.
 */
void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
{
    // Scratch storage; only the five level-3 entries set below are used.
    blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ];

    blksz_t* nc = &trsm_blkszs[ BLIS_NC ];
    blksz_t* kc = &trsm_blkszs[ BLIS_KC ];
    blksz_t* mc = &trsm_blkszs[ BLIS_MC ];
    blksz_t* nr = &trsm_blkszs[ BLIS_NR ];
    blksz_t* mr = &trsm_blkszs[ BLIS_MR ];

    //                          s      d      c      z
    bli_blksz_init_easy( nc, 4080,  1600,  4080,  4080 );
    bli_blksz_init_easy( kc,  256,   492,   256,   256 );
    bli_blksz_init_easy( mc,  144,    72,   144,    72 );
    bli_blksz_init_easy( nr,   16,     8,     8,     4 );
    bli_blksz_init_easy( mr,    6,     6,     3,     3 );

    // Install the TRSM register and cache blocksizes (and multiples) in
    // the context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      // level-3
      BLIS_NC, nc, BLIS_NR,
      BLIS_KC, kc, BLIS_KR,
      BLIS_MC, mc, BLIS_MR,
      BLIS_NR, nr, BLIS_NR,
      BLIS_MR, mr, BLIS_MR,
      cntx
    );
}
/*
 * Put the default zen4 block sizes back into the context.
 *
 * This is the counterpart of bli_zen4_override_trsm_blkszs(): call it to
 * return the block sizes to their default values after they were
 * overridden to enable the AVX2 GEMM kernels in the TRSM path.
 */
void bli_zen4_restore_default_blkszs (cntx_t* cntx)
{
    blksz_t dflt_blkszs[ BLIS_NUM_BLKSZS ];

    // Fill in the same default list that is used when the context is
    // first initialized.
    BLI_CNTX_DEFAULT_BLKSZ_LIST(dflt_blkszs);

    // Re-install the default register and cache blocksizes (and multiples)
    // in the context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,
      // level-3
      BLIS_NC, dflt_blkszs + BLIS_NC, BLIS_NR,
      BLIS_KC, dflt_blkszs + BLIS_KC, BLIS_KR,
      BLIS_MC, dflt_blkszs + BLIS_MC, BLIS_MR,
      BLIS_NR, dflt_blkszs + BLIS_NR, BLIS_NR,
      BLIS_MR, dflt_blkszs + BLIS_MR, BLIS_MR,
      // level-1f
      BLIS_AF, dflt_blkszs + BLIS_AF, BLIS_AF,
      BLIS_DF, dflt_blkszs + BLIS_DF, BLIS_DF,
      cntx
    );
}

View File

@@ -62,4 +62,28 @@
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
/*
* Override the block sizes in the context with the block sizes used
* by the AVX2 GEMM+TRSM kernels. This is needed in the Zen4 context
* because the default GEMM kernels are AVX512 based and use different
* block sizes.
*
* This function should be called in the TRSM path before performing
* any packing operations.
*
* The context must be restored to its default values by calling
* bli_zen4_restore_default_blkszs() before exiting the TRSM path.
*/
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
/*
* Restore the block sizes to the default values needed for the zen4 context.
*
* This function should be called to restore the block sizes to their
* default values if they were overridden by calling
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
* TRSM path.
*
*/
BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
#endif

View File

@@ -73,11 +73,11 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
# gcc or clang version must be atleast 4.0
# gcc 9.0 or later:
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
CRVECFLAGS += -march=znver3
else
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
CRVECFLAGS += -march=znver2
else
# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
@@ -107,12 +107,12 @@ CRVECFLAGS += -march=znver4
else
# for version 3x we will enable znver3
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
CRVECFLAGS += -march=znver3
else
# for version 2x we will enable znver2
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
CRVECFLAGS += -march=znver2
else
#if compiling with clang

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -36,6 +36,7 @@
#include "blis.h"
//#define PRINT_SMALL_TRSM_INFO
void bli_trsm_front
(
side_t side,
@@ -151,6 +152,24 @@ void bli_trsm_front
// in bli_packm_init().
if ( bli_cntx_method( cntx ) == BLIS_NAT )
{
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (until we implement TRSM AVX-512 kernels).
*
* The AVX2 kernels use different block sizes than the AVX-512 kernels.
* Here we override the default block sizes in the context with the
* AVX2-specific block sizes used by the GEMMTRSM kernels.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/
if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) &&
(bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) )
{
bli_zen4_override_trsm_blkszs(cntx);
}
#endif
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
}
@@ -177,6 +196,20 @@ void bli_trsm_front
rntm,
cntl
);
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
/* Zen4 TRSM Fixme:
*
* We have overridden the block sizes at the start of this function.
* Since the context is created only once, we need to ensure that the
* default block sizes are restored for the subsequent operations.
*/
if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) &&
(bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) )
{
bli_zen4_restore_default_blkszs(cntx);
}
#endif
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (until we implement TRSM AVX-512 kernels).
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible only with the AVX2 version of the GEMM kernels.
*
* Here we force the GEMM kernels to the AVX2 variants for float and double.
* For scomplex and dcomplex the reference path is retained as is.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/ \
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
{ \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
} \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (until we implement TRSM AVX-512 kernels).
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible only with the AVX2 version of the GEMM kernels.
*
* Here we force the GEMM kernels to the AVX2 variants for float and double.
* For scomplex and dcomplex the reference path is retained as is.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/ \
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
{ \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
} \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -180,6 +180,25 @@ void PASTEMAC(ch,varname) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (until we implement TRSM AVX-512 kernels).
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible only with the AVX2 version of the GEMM kernels.
*
* Here we force the GEMM kernels to the AVX2 variants for float and double.
* For scomplex and dcomplex the reference path is retained as is.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/ \
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
{ \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
} \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -179,6 +179,25 @@ void PASTEMAC(ch,varname) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (until we implement TRSM AVX-512 kernels).
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible only with the AVX2 version of the GEMM kernels.
*
* Here we force the GEMM kernels to the AVX2 variants for float and double.
* For scomplex and dcomplex the reference path is retained as is.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/ \
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
{ \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
} \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -272,7 +272,7 @@ dim_t bli_determine_blocksize_f
b_alg = bli_blksz_get_def( dt, bsize );
b_max = bli_blksz_get_max( dt, bsize );
// If b_use != 0, this means that trsm blocksizes are set
// If b_alg != 0, this means that trsm blocksizes are set
// and we continue with trsm-specific blocksizes.
// Else, we query L3 blocksizes and use them for TRSM execution.
if( b_alg > 0 ) return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max);
@@ -313,10 +313,10 @@ dim_t bli_determine_blocksize_b
b_alg = bli_blksz_get_def( dt, bsize );
b_max = bli_blksz_get_max( dt, bsize );
// If b_use != 0, this means that trsm blocksizes are set
// If b_alg != 0, this means that trsm blocksizes are set
// and we continue with trsm-specific blocksizes.
// Else, we query L3 blocksizes and use them for TRSM execution.
if( b_alg > 0 ) bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
if( b_alg > 0 ) return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
}

View File

@@ -802,10 +802,11 @@ typedef enum
BLIS_GEMMTRSM_L_UKR,
BLIS_GEMMTRSM_U_UKR,
BLIS_TRSM_L_UKR,
BLIS_TRSM_U_UKR
BLIS_TRSM_U_UKR,
BLIS_GEMM_AVX2_UKR
} l3ukr_t;
#define BLIS_NUM_LEVEL3_UKRS 5
#define BLIS_NUM_LEVEL3_UKRS 6
typedef enum

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -209,13 +209,32 @@ void libblis_test_gemmtrsm_ukr_experiment
// Query a context.
cntx = bli_gks_query_cntx();
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
/* Zen4 TRSM Fixme:
*
* TRSM and GEMM use different values of MR and NR, so we need to ensure that
* the values used for packing match the MR and NR values expected by the
* kernels. For now this issue exists only for zen4, hence the values are
* overridden here when the architecture is zen4.
*
* We need to override the values here as well because the packing and compute
* kernels are invoked directly from here (instead of through a BLIS/BLAS call).
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/
if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
{
bli_zen4_override_trsm_blkszs(cntx);
}
#endif
// Use the datatype of the first char in the datatype combination string.
bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
// Map the dimension specifier to actual dimensions.
k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
// Fix m and n to MR and NR, respectively.
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
@@ -224,6 +243,7 @@ void libblis_test_gemmtrsm_ukr_experiment
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
// Store the register blocksizes so that the driver can retrieve the
// values later when printing results.
op->dim_aux[0] = m;
@@ -433,6 +453,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
#endif
// Free the packed objects.
bli_obj_free( &ap );
bli_obj_free( &bp );
@@ -442,6 +463,20 @@ bli_printm( "ap", &ap, "%5.2f", "" );
bli_obj_free( &b );
bli_obj_free( &c11 );
bli_obj_free( &c11_save );
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
/* Zen4 TRSM Fixme:
*
* We have overridden the block sizes at the start of this function.
* Since the context is created only once, we need to ensure that the
* default block sizes are restored for the subsequent operations.
*/
if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
{
bli_zen4_restore_default_blkszs(cntx);
}
#endif
}