mirror of
https://github.com/amd/blis.git
synced 2026-06-06 04:34:02 +00:00
Add AVX2 path for TRSM+GEMM combination.
- Enabled AVX2 TRSM + GEMM kernel path, when GEMM is called from TRSM context it will invoke AVX2 GEMM kernels instead of the default AVX-512 GEMM kernels. - The default context has the block sizes for AVX512 GEMM kernels, however, TRSM uses AVX2 GEMM kernels and they need different block sizes. - Added new API bli_zen4_override_trsm_blkszs(). It overrides default block sizes in context with block sizes needed for AVX2 GEMM kernels. - Added new API bli_zen4_restore_default_blkszs(). It restores the block sizes to their default values (as needed by default AVX512 GEMM kernels). - Updated bli_trsm_front() to override the block sizes in the context needed by TRSM + AVX2 GEMM kernels and restore them to the default values at the end of this function. It is done in bli_trsm_front() so that we override the context before creating different threads. AMD-Internal: [CPUPL-2225] Change-Id: Ie92d0fc40f94a32dfb865fe3771dc14ed7884c55
This commit is contained in:
committed by
Dipal M. Zambare
parent
d4bb906094
commit
2ba2fb2b63
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -61,5 +61,29 @@
|
||||
|
||||
//#define BLIS_ENABLE_FAST_MATH
|
||||
|
||||
/*
|
||||
* Override the block sizes in the context to the block sizes used
|
||||
* by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
|
||||
* GEMM kernels are AVX512 based and use different block sizes.
|
||||
*
|
||||
* This function should be called in TRSM path before performing
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* bli_zen4_restore_default_blkszs() before exiting TRSM Path
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
|
||||
|
||||
/*
|
||||
* Restore the block sizes to default values needed for zen4 context.
|
||||
*
|
||||
* This function should be called to restore the block sizes to their
|
||||
* default values if they were overridden by calling
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* TRSM path.
|
||||
*
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -34,6 +34,24 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
* List of default block sizes for zen4.
|
||||
* Defined as a macro because this list is used in multiple places in this file.
|
||||
*/
|
||||
|
||||
#define BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs) \
|
||||
/* s d c z */ \
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); \
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 ); \
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 ); \
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 512, 256, 566, \
|
||||
480, 320, 256, 566 ); \
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 4004, 4080, 256 ); \
|
||||
\
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); \
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \
|
||||
|
||||
|
||||
void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
@@ -47,20 +65,23 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
4,
|
||||
10,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
#if 0 // GENOA TODO: TRSM AVX-512 implementation
|
||||
|
||||
BLIS_GEMM_AVX2_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_AVX2_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
#endif
|
||||
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -115,7 +136,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
24,
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
|
||||
|
||||
// axpbyv
|
||||
@@ -162,17 +183,8 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
//
|
||||
// These are reference block sizes and may be overridden based on
|
||||
// number of threads used at runtime.
|
||||
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 512, 256, 566,
|
||||
480, 320, 256, 566 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 4004, 4080, 256 );
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
@@ -192,11 +204,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
);
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
#if 0 // GENOA TODO: TRSM AVX-512 implementation
|
||||
#if 0 // Replaced with runtime blocksize override
|
||||
|
||||
//Initialize TRSM blocksize objects with architecture-specific values.
|
||||
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
//Tuning is done for double-precision only.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );
|
||||
@@ -298,3 +313,72 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Override the block sizes in the context to the block sizes used
|
||||
* by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
|
||||
* GEMM kernels are AVX512 based and use different block sizes.
|
||||
*
|
||||
* This function should be called in TRSM path before performing
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* bli_zen4_restore_default_blkszs() before exiting TRSM Path
|
||||
*/
|
||||
void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
|
||||
{
|
||||
// Scratch table; only the five level-3 entries filled in below are
// handed to bli_cntx_set_blkszs().
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
// Block sizes for the AVX2 GEMM+TRSM kernels, one value per datatype in
// the order s, d, c, z. These are the same numbers as the TRSM blocksize
// table that is disabled with "#if 0" in bli_cntx_init_zen4().
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );
|
||||
|
||||
|
||||
// Write the AVX2 register and cache blocksizes (and their multiples) into
|
||||
// the caller's context for native execution.
// Only the five level-3 blocksizes are passed; BLIS_AF and BLIS_DF are not
// part of this call.
// NOTE(review): this modifies the context object passed in. The trsm
// caller notes that the context is created only once, so other operations
// reading the same context before bli_zen4_restore_default_blkszs() runs
// would presumably see these AVX2 values -- confirm this is acceptable.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Restore the block sizes to default values needed for zen4 context.
|
||||
*
|
||||
* This function should be called to restore the block sizes to their
|
||||
* default values if they were overridden by calling
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* TRSM path.
|
||||
*
|
||||
*/
|
||||
void bli_zen4_restore_default_blkszs (cntx_t* cntx)
|
||||
{
|
||||
// Scratch table that receives the default zen4 block sizes.
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
// Fill the table from the same macro that bli_cntx_init_zen4() uses, so
// the restored values are the ones the context was initialized with.
BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
|
||||
|
||||
// Write the default register and cache blocksizes (and their multiples)
|
||||
// back into the caller's context for native execution.
// All seven entries are passed (five level-3 plus BLIS_AF and BLIS_DF),
// whereas bli_zen4_override_trsm_blkszs() passes only the five level-3 ones.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
@@ -62,4 +62,28 @@
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
/*
|
||||
* Override the block sizes in the context to the block sizes used
|
||||
* by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
|
||||
* GEMM kernels are AVX512 based and use different block sizes.
|
||||
*
|
||||
* This function should be called in TRSM path before performing
|
||||
* any packing operations.
|
||||
*
|
||||
* Also the context must be restored to default values by calling
|
||||
* bli_zen4_restore_default_blkszs() before exiting TRSM Path
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
|
||||
|
||||
/*
|
||||
* Restore the block sizes to default values needed for zen4 context.
|
||||
*
|
||||
* This function should be called to restore the block sizes to their
|
||||
* default values if they were overridden by calling
|
||||
* bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
|
||||
* TRSM path.
|
||||
*
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -73,11 +73,11 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
|
||||
# gcc or clang version must be at least 4.0
|
||||
# gcc 11.0 or later:
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver3
|
||||
else
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver2
|
||||
else
|
||||
# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
|
||||
@@ -107,12 +107,12 @@ CRVECFLAGS += -march=znver4
|
||||
else
|
||||
# for version 3x we will enable znver3
|
||||
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver3
|
||||
else
|
||||
# for version 2x we will enable znver2
|
||||
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver2
|
||||
else
|
||||
#if compiling with clang
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -36,6 +36,7 @@
|
||||
#include "blis.h"
|
||||
//#define PRINT_SMALL_TRSM_INFO
|
||||
|
||||
|
||||
void bli_trsm_front
|
||||
(
|
||||
side_t side,
|
||||
@@ -151,6 +152,24 @@ void bli_trsm_front
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (until we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels use different block sizes than AVX512 kernels.
|
||||
* Here we override the default block sizes in the context with AVX2
|
||||
* specific block size used in GEMMTRSM kernels.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/
|
||||
if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) &&
|
||||
(bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) )
|
||||
{
|
||||
bli_zen4_override_trsm_blkszs(cntx);
|
||||
}
|
||||
#endif
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
@@ -177,6 +196,20 @@ void bli_trsm_front
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
|
||||
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* We have overridden the block sizes at the start of this function.
|
||||
* Since the context is created only once we need to ensure that the
|
||||
* default block sizes are restored for the subsequent operations.
|
||||
*/
|
||||
if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) &&
|
||||
(bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) )
|
||||
{
|
||||
bli_zen4_restore_default_blkszs(cntx);
|
||||
}
|
||||
#endif
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (until we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
*
|
||||
* Here we force the GEMM kernels to the AVX2 variants for float and double.
|
||||
* For scomplex and dcomplex reference path is retained as is.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/ \
|
||||
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
|
||||
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
|
||||
{ \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
|
||||
} \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (until we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
*
|
||||
* Here we force the GEMM kernels to the AVX2 variants for float and double.
|
||||
* For scomplex and dcomplex reference path is retained as is.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/ \
|
||||
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
|
||||
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
|
||||
{ \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
|
||||
} \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -180,6 +180,25 @@ void PASTEMAC(ch,varname) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (until we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
*
|
||||
* Here we force the GEMM kernels to the AVX2 variants for float and double.
|
||||
* For scomplex and dcomplex reference path is retained as is.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/ \
|
||||
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
|
||||
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
|
||||
{ \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
|
||||
} \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -179,6 +179,25 @@ void PASTEMAC(ch,varname) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (until we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
*
|
||||
* Here we force the GEMM kernels to the AVX2 variants for float and double.
|
||||
* For scomplex and dcomplex reference path is retained as is.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/ \
|
||||
if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
|
||||
(dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \
|
||||
{ \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
|
||||
} \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -272,7 +272,7 @@ dim_t bli_determine_blocksize_f
|
||||
b_alg = bli_blksz_get_def( dt, bsize );
|
||||
b_max = bli_blksz_get_max( dt, bsize );
|
||||
|
||||
// If b_use != 0, this means that trsm blocksizes are set
|
||||
// If b_alg != 0, this means that trsm blocksizes are set
|
||||
// and we continue with trsm-specific blocksizes.
|
||||
// Else, we query L3 blocksizes and use them for TRSM execution.
|
||||
if( b_alg > 0 ) return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max);
|
||||
@@ -313,10 +313,10 @@ dim_t bli_determine_blocksize_b
|
||||
b_alg = bli_blksz_get_def( dt, bsize );
|
||||
b_max = bli_blksz_get_max( dt, bsize );
|
||||
|
||||
// If b_use != 0, this means that trsm blocksizes are set
|
||||
// If b_alg != 0, this means that trsm blocksizes are set
|
||||
// and we continue with trsm-specific blocksizes.
|
||||
// Else, we query L3 blocksizes and use them for TRSM execution.
|
||||
if( b_alg > 0 ) bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
|
||||
if( b_alg > 0 ) return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -802,10 +802,11 @@ typedef enum
|
||||
BLIS_GEMMTRSM_L_UKR,
|
||||
BLIS_GEMMTRSM_U_UKR,
|
||||
BLIS_TRSM_L_UKR,
|
||||
BLIS_TRSM_U_UKR
|
||||
BLIS_TRSM_U_UKR,
|
||||
BLIS_GEMM_AVX2_UKR
|
||||
} l3ukr_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL3_UKRS 5
|
||||
#define BLIS_NUM_LEVEL3_UKRS 6
|
||||
|
||||
|
||||
typedef enum
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -209,13 +209,32 @@ void libblis_test_gemmtrsm_ukr_experiment
|
||||
// Query a context.
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* TRSM and GEMM use different values of MR and NR, so we need to ensure that
|
||||
* Values used for packing are as per the MR and NR values expected by the kernels
|
||||
* For now this issue exists only for zen4 hence override the values here if
|
||||
* the family is BLIS_TRSM and architecture is zen4
|
||||
*
|
||||
* We need to override the values here as well as the packing and compute
|
||||
* kernels are invoked directly from here (instead of BLIS/BLAS call.)
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/
|
||||
if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
|
||||
{
|
||||
bli_zen4_override_trsm_blkszs(cntx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Use the datatype of the first char in the datatype combination string.
|
||||
bli_param_map_char_to_blis_dt( dc_str[0], &datatype );
|
||||
|
||||
// Map the dimension specifier to actual dimensions.
|
||||
k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
|
||||
|
||||
// Fix m and n to MR and NR, respectively.
|
||||
|
||||
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
|
||||
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
@@ -224,6 +243,7 @@ void libblis_test_gemmtrsm_ukr_experiment
|
||||
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
|
||||
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
|
||||
// Store the register blocksizes so that the driver can retrieve the
|
||||
// values later when printing results.
|
||||
op->dim_aux[0] = m;
|
||||
@@ -433,6 +453,7 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
#endif
|
||||
|
||||
|
||||
// Free the packed objects.
|
||||
bli_obj_free( &ap );
|
||||
bli_obj_free( &bp );
|
||||
@@ -442,6 +463,20 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c11 );
|
||||
bli_obj_free( &c11_save );
|
||||
|
||||
#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4)
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* We have overridden the block sizes at the start of this function.
|
||||
* Since the context is created only once we need to ensure that the
|
||||
* default block sizes are restored for the subsequent operations.
|
||||
*/
|
||||
if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
|
||||
{
|
||||
bli_zen4_restore_default_blkszs(cntx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user