mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 14:29:05 +00:00
This PR introduces support for double buffering in LDS into GEMM kernels that use direct load instructions. Direct loads now use inline asm instead of intrinsics. Usage of intrinsics results in the compiler adding additional waitcnt instructions, which breaks the possible load/compute overlap in the case of double buffering. Usage of inline asm results in the need to use sched_barrier in order to make sure that the compiler cannot incorrectly reschedule instructions, since it does not know the data dependencies between global->LDS and LDS->registers.
255 lines
8.6 KiB
C++
255 lines
8.6 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/config.h"
|
|
|
|
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
|
|
#include "hip/hip_runtime.h"
|
|
#include "hip/hip_fp16.h"
|
|
#endif
|
|
|
|
#define CK_TIME_KERNEL 1
|
|
|
|
// constant address space for kernel parameter
|
|
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
|
|
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
|
|
|
|
// launch bounds
|
|
#define CK_USE_LAUNCH_BOUNDS 1
|
|
|
|
#ifdef CK_USE_LAUNCH_BOUNDS
|
|
// for most kernels
|
|
#define CK_MAX_THREAD_PER_BLOCK 256
|
|
#define CK_MIN_BLOCK_PER_CU 2
|
|
|
|
// for wavelet GEMM kernel
|
|
#define CK_WAVELET_MAX_THREAD_PER_BLOCK 512
|
|
#define CK_WAVELET_MIN_BLOCK_PER_CU 2
|
|
#endif
|
|
|
|
// kernel attribute: amdgpu_waves_per_eu()
|
|
#ifdef CK_USE_WAVES_PER_EU
|
|
// for 1-wave kernels, control arguments of amdgpu_waves_per_eu() attribute
|
|
#ifndef CK_MIN_WAVES_PER_EU
|
|
#define CK_MIN_WAVES_PER_EU 0
|
|
#endif
|
|
|
|
#ifndef CK_MAX_WAVES_PER_EU
|
|
#define CK_MAX_WAVES_PER_EU 0
|
|
#endif
|
|
|
|
#else
|
|
#define CK_USE_WAVES_PER_EU 0
|
|
#endif
|
|
|
|
// buffer resource
|
|
#ifndef __HIP_DEVICE_COMPILE__ // for host code
|
|
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
|
|
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
|
|
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
|
|
defined(__gfx942__) // for GPU code
|
|
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
|
|
#elif defined(__gfx1030__) // for GPU code
|
|
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
|
|
#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
|
|
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
|
|
#endif
|
|
|
|
// FMA instruction
|
|
#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing
|
|
#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
|
|
#define CK_USE_AMD_V_MAC_F32
|
|
#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
|
|
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // for GPU code
|
|
#define CK_USE_AMD_V_FMAC_F32
|
|
#define CK_USE_AMD_V_DOT2_F32_F16
|
|
#define CK_USE_AMD_V_DOT4_I32_I8
|
|
#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
|
|
#define CK_USE_AMD_V_FMAC_F32
|
|
#define CK_USE_AMD_V_DOT2_F32_F16
|
|
#define CK_USE_AMD_V_DOT4_I32_I8_GFX11
|
|
#endif
|
|
|
|
// MFMA instruction
|
|
#ifndef __HIP_DEVICE_COMPILE__ // for host code
|
|
#define CK_USE_AMD_MFMA
|
|
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
|
|
defined(__gfx942__) // for GPU code
|
|
#define CK_USE_AMD_MFMA
|
|
#endif
|
|
|
|
#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
|
#define CK_USE_AMD_MFMA_BF16_1K_OP
|
|
#endif
|
|
|
|
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
|
#define CK_USE_AMD_MFMA_GFX940
|
|
#endif
|
|
|
|
// WMMA instruction
|
|
#ifndef __HIP_DEVICE_COMPILE__ // for host code
|
|
#define CK_USE_AMD_WMMA
|
|
#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
|
|
#define CK_USE_AMD_WMMA
|
|
#endif
|
|
|
|
// buffer load
|
|
#define CK_USE_AMD_BUFFER_LOAD 1
|
|
|
|
// buffer store
|
|
#define CK_USE_AMD_BUFFER_STORE 1
|
|
|
|
// buffer atomic add: integer
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
|
|
|
|
// buffer atomic add: floating point
|
|
#ifndef __HIP_DEVICE_COMPILE__ // for host code
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
|
|
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
|
|
defined(__gfx942__) // for GPU code
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
|
|
#else // for GPU code
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
|
|
#endif
|
|
|
|
#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
|
|
defined(__gfx942__)) // for GPU code
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
|
|
#else
|
|
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
|
|
#endif
|
|
|
|
// inline asm
|
|
#define CK_USE_AMD_INLINE_ASM 1
|
|
|
|
// inner product (V_MAC/V_FMAC)
|
|
#define CK_USE_AMD_V_MAC_INLINE_ASM 1
|
|
|
|
// V_DOT inline instructions, less efficient since they require adding
|
|
// `s_nop`s to avoid hazard
|
|
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
|
|
|
|
// inner product using V_DOT with DPP8 modifiers
|
|
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
|
|
|
|
// LDS direct loads using inline assembly
|
|
#define CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
|
|
|
|
// set stochastic rounding as default for f8 conversions
|
|
#define CK_USE_SR_F8_CONVERSION 1
|
|
|
|
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
|
|
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
|
|
|
|
// experimental feature: multi index implemented as array
|
|
#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
|
|
|
|
// experimental feature: static tensor descriptor
|
|
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
|
|
|
|
// experimental feature: buffer load/store/atomic-add/ OOB trick
|
|
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
|
// setting. Don't use this hack unless absolutely necessary!
|
|
// FIXME: make the behavior of buffer load a configurable (template) parameter for each usage
|
|
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
|
|
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
|
|
#endif
|
|
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
|
|
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
|
|
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
|
|
|
|
// experimental feature: in-register sub-dword transpose
|
|
#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
|
|
|
|
// experimental feature: merge transformation use magic number division
|
|
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
|
|
|
|
// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
|
|
// pointer of scalar
|
|
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
|
|
|
|
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
|
|
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
|
|
|
|
// experimental feature: optimize for inter-wave scheduling policy
|
|
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 1
|
|
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
|
|
// this will let make_default_loop_scheduler() return interwave scheduling flag by default
|
|
#define CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING 0
|
|
// experimental feature: add instances using interwave scheduling
|
|
#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
|
|
// experimental feature: add instances using pipeline v2
|
|
#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
|
|
// experimental feature: optimize pipeline v2 by IGLP strategy (value=ID of strategy)
|
|
#ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
|
|
#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0
|
|
#endif
|
|
|
|
// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
|
|
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
|
|
// thread-invariant, otherwise it's a bug
|
|
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
|
|
#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
|
|
|
|
// workaround: compiler crash when compiling recursive lambda
|
|
#define CK_WORKAROUND_SWDEV_275126 1
|
|
|
|
// workaround: compiler crash when using buffer load/store for i8
|
|
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
|
|
|
|
// workaround: compiler generating inefficient ds_write instructions
|
|
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
|
|
|
|
// workaround: verification failure, due to compiler regression, for conv bwd-data fp16 using some
|
|
// tuning parameter
|
|
#define CK_WORKAROUND_SWDEV_325164 0
|
|
|
|
// workaround: compiler not emitting reciprocal instruction from __frcp_rn()
|
|
#define CK_WORKAROUND_SWDEV_383542 1
|
|
|
|
// workaround: compiler issue on gfx908
|
|
#define CK_WORKAROUND_SWDEV_388832 1
|
|
|
|
// flag to enable (1) or disable (0) the debugging output in some kernels
|
|
#define DEBUG_LOG 0
|
|
|
|
// denorm test fix, required to work around a denorm-handling compiler issue
|
|
#ifndef CK_WORKAROUND_DENORM_FIX
// not requested by the build: disabled by default
#define CK_WORKAROUND_DENORM_FIX 0
#else
// Requested by the build. The workaround is only valid on MI200 (gfx90a), so
// force-enable it there and force-disable it on every other target.
// NOTE: the previous form used a bare `#elif` (ill-formed: #elif requires a
// controlling expression) and a self-referential `#define X = X && defined(...)`
// (stray `=` token; `defined` inside a macro body is non-portable), so the
// override never worked.
#undef CK_WORKAROUND_DENORM_FIX
#if defined(__gfx90a__)
#define CK_WORKAROUND_DENORM_FIX 1
#else
#define CK_WORKAROUND_DENORM_FIX 0
#endif
#endif // CK_WORKAROUND_DENORM_FIX
|
|
|
|
namespace ck {
|
|
|
|
// Kind of operation a kernel performs when committing results to memory.
enum struct InMemoryDataOperationEnum
{
    Set,       // plain store: overwrite the destination
    AtomicAdd, // atomic add into the destination
    AtomicMax, // atomic max into the destination
    Add        // accumulate into the destination (presumably non-atomic, in
               // contrast to AtomicAdd — confirm at usage sites)
};
|
|
|
|
// FIXME: use regular Sequence and remove this
|
|
template <InMemoryDataOperationEnum... Is>
|
|
struct InMemoryDataOperationEnumSequence
|
|
{
|
|
static constexpr int mSize = sizeof...(Is);
|
|
|
|
__host__ __device__ static constexpr InMemoryDataOperationEnum At(int I)
|
|
{
|
|
// the last dummy element is to prevent compiler complain about empty array, when mSize = 0
|
|
const InMemoryDataOperationEnum mData[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set};
|
|
return mData[I];
|
|
}
|
|
};
|
|
|
|
// index type
|
|
using index_t = int32_t;
|
|
using long_index_t = int64_t;
|
|
|
|
} // namespace ck
|