From a2d3ef1536bcc792c934a27cfecbdc7659207029 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 3 May 2023 16:27:04 -0700 Subject: [PATCH] Fix the group of quantization_int8 kernels on MI300. (#695) * replace amd_buffer_atomic_add with hip_atomic_add * fix grouped_gemm_splitk kernels on mi300 * fix syntax * revert experimental atomic_add changes * fix the group of kernels from ticket 723 on MI300 --------- Co-authored-by: Jing Zhang [ROCm/composable_kernel commit: b8635a25b2ce87f70433b32e00858c6ae0f39fde] --- .../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 5 +++-- .../gpu/device/impl/device_gemm_multiple_d_dl.hpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp index 66b47abb62..5880f5f601 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -135,7 +135,7 @@ __global__ void const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \ - defined(__gfx90a__) || defined(__gfx908__)) + defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__)) // offset base pointer for each work-group const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); @@ -710,7 +710,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK // check device if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" || - ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908")) + ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908" || + ck::get_device_name() == "gfx940")) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp index 6368469309..4397b6f99a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp @@ -51,7 +51,7 @@ __global__ void const Block2CTileMap block_2_ctile_map) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ - defined(__gfx90a__) || defined(__gfx1030__)) + defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__)) constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType); @@ -552,7 +552,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD