[ck] add gridwise base class for in all xdl kernel (#186) (#3544)

1. Add base class GridwiseGemm_xdl_cshuffle_base for all gridwise_gemm_xdl classes.
- To select the correct LDS layout and epilogue behavior, three additional parameters are added.
- ForceNaiveLdsLayout: disable XOR based LDS layout when it is true
- DirectLoad: the pipeline only uses direct load; we need to force the naive layout and ignore any padding on gfx9
- IsMxGemm: the epilogue has two additional dimensions
2. Move all LDS descriptor layout related functions to the base class, including
- GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
- GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1
- GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
3. Move several LDS related helper functions to the base class, including
- GetSharedMemoryNumberOfByte
- GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1
- GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1
- GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
4. Move all C epilogue related code to the base class; four kinds of implementations are provided:
- RunEpilogueNoShuffle
- RunEpilogue
- RunMultiDEpilogue
- RunMoeEpilogue
This commit is contained in:
linqunAMD
2026-01-28 04:49:47 +08:00
committed by GitHub
parent b737f1dee5
commit 23cefda140
85 changed files with 9016 additions and 20081 deletions

View File

@@ -107,7 +107,7 @@ __device__ void device_grouped_conv_fwd_multiple_abd_xdl_cshuffle(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const auto& ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -36,7 +36,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -59,7 +59,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);

View File

@@ -39,7 +39,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,

View File

@@ -63,7 +63,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid,
const ABDataType* __restrict__ p_b_grid,
@@ -99,7 +99,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -42,7 +42,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_gemm_xdl_cshuffle_v1(
const A0B0B1DataType* __restrict__ p_a0_grid,

View File

@@ -33,14 +33,14 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_xdl_cshuffle_v3_multi_d(BatchedGemmArg karg)
{
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
@@ -82,7 +82,7 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds(BatchedGemmArg karg)
{
@@ -91,8 +91,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;

View File

@@ -39,7 +39,7 @@ template <typename GridwiseGemm,
bool HasMainK0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_reduce_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
@@ -81,7 +81,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset;
});
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
p_a_grid + a_batch_offset,

View File

@@ -43,7 +43,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,

View File

@@ -44,7 +44,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,

View File

@@ -48,7 +48,7 @@ namespace device {
template <typename DeviceOp, typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
{
@@ -67,7 +67,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const auto a_grid_desc_k0_m_k1 =
amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(

View File

@@ -33,14 +33,14 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_b_scale_xdl_cshuffle_v3(BatchedGemmArg karg)
{
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
@@ -83,8 +83,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;

View File

@@ -37,7 +37,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_abd_xdl_cshuffle(
AsPointer p_as_grid,
@@ -58,7 +58,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid,

View File

@@ -35,7 +35,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -56,7 +56,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
p_a_grid,

View File

@@ -38,7 +38,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_multiple_d_multiple_r_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -63,7 +63,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_a_grid,

View File

@@ -37,7 +37,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
@@ -57,7 +57,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
p_a_grid,
@@ -899,7 +899,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
assert(desc.IsValid());
#endif
using GridwiseGemm = conditional_t<get_warp_size() == 64, GridwiseGemm64, GridwiseGemm32>;
__shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char
p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
if(desc.has_main_k_block_loop)
{
GridwiseGemm::template Run<true, InMemoryDataOperationEnum::Set>(

View File

@@ -403,14 +403,29 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
KBatch_cond_choice.value == (arg.KBatch > 1) &&
tail_num_choice.value == tail_num)
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
if constexpr(is_same_v<BLayout, tensor_layout::gemm::MFMA>)
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx_bpreshuffle< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
return;
}
else
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
}
}
});
return ave_time;

View File

@@ -323,7 +323,7 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
&occupancy_,
kernel,
BlockSize,
GridwiseGemm64::GetSharedMemoryNumberOfByte());
GridwiseGemm64::GetSharedMemoryNumberOfByteOnHost());
hip_check_error(rtn);
}
}
@@ -336,7 +336,7 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
&occupancy_,
kernel,
BlockSize,
GridwiseGemm32::GetSharedMemoryNumberOfByte());
GridwiseGemm32::GetSharedMemoryNumberOfByteOnHost());
hip_check_error(rtn);
}
}
@@ -396,7 +396,7 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
&occupancy_,
kernel,
BlockSize,
GridwiseGemm64::GetSharedMemoryNumberOfByte());
GridwiseGemm64::GetSharedMemoryNumberOfByteOnHost());
hip_check_error(rtn);
}
}
@@ -409,7 +409,7 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
&occupancy_,
kernel,
BlockSize,
GridwiseGemm32::GetSharedMemoryNumberOfByte());
GridwiseGemm32::GetSharedMemoryNumberOfByteOnHost());
hip_check_error(rtn);
}
}

View File

@@ -49,7 +49,7 @@ __launch_bounds__(CK_WAVELET_MAX_THREAD_PER_BLOCK, CK_WAVELET_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
p_b_grid,

View File

@@ -28,7 +28,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_contraction_multiple_d_xdl_cshuffle(
const void CK_CONSTANT_ADDRESS_SPACE* contraction_args,
@@ -40,7 +40,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t block_id = get_block_1d_id();

View File

@@ -80,7 +80,7 @@ template <typename GridwiseGemm,
bool CTranspose>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle(
const ABDataType* __restrict__ p_a_grid,
@@ -124,7 +124,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -50,7 +50,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_bwd_weight_multiple_d(
const FloatA* __restrict__ p_a_grid,

View File

@@ -82,7 +82,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
const long_index_t e_batch_offset = amd_wave_read_first_lane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DispatchSplitKHack<GridwiseGemm,
AGridDesc_AK0_M_K1,
@@ -155,8 +155,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DispatchSplitKHack_2Lds<GridwiseGemm,
AGridDesc_AK0_M_K1,

View File

@@ -117,7 +117,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_bwd_weight(const FloatA* __restrict__ p_a_grid,
const FloatB* __restrict__ p_b_grid,

View File

@@ -80,7 +80,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DispatchSplitKHack<GridwiseGemm,
AGridDesc_AK0_M_K1,

View File

@@ -85,7 +85,7 @@ template <typename GridwiseGemm,
bool CTranspose>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle(
AsPointer p_as_grid,
@@ -120,7 +120,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -116,7 +116,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
using Block2CTileMap = typename GridwiseGemm::Block2CTileMapDefault;
const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};
@@ -238,8 +238,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
using Block2CTileMap = typename GridwiseGemm::Block2CTileMapDefault;
const auto block_2_ctile_map = Block2CTileMap{karg.M, karg.N, 4};

View File

@@ -173,7 +173,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
const auto rs_batch_offset = compute_ptr_offset_of_batch.GetRsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -59,7 +59,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);

View File

@@ -36,7 +36,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_gemm_xdl_fixed_nk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
const index_t group_count,
@@ -48,7 +48,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<EGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t KBatch = 1;

View File

@@ -73,7 +73,7 @@ template <typename GridwiseGemm,
BlockGemmPipelineVersion BlkGemmPipelineVer>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_gemm_multiple_d_xdl(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
const index_t group_count,
@@ -84,7 +84,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
constexpr index_t shared_size =
GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch());
__shared__ uint8_t p_shared[shared_size];
__shared__ uint8_t p_shared1[shared_size];

View File

@@ -30,7 +30,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_gemm_xdl(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
const index_t group_count,
@@ -41,7 +41,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t block_id = get_block_1d_id();

View File

@@ -38,7 +38,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_gemm_xdl_fixed_nk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
uint32_t* barrier_count,
@@ -53,7 +53,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<EGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t block_id = get_block_1d_id();

View File

@@ -33,7 +33,7 @@ template <typename GridwiseGemm,
typename CElementwiseOperation = ck::tensor_operation::element_wise::PassThrough>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
const index_t group_count,
@@ -44,7 +44,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
{
constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
constexpr index_t shared_size =
GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch());
__shared__ uint8_t p_shared[shared_size];
const index_t block_id = get_block_1d_id();

View File

@@ -36,7 +36,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -59,7 +59,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);