From 0e877b848179f53419ef7b9690cdd86aec062abb Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 10 Apr 2022 02:05:14 +0000 Subject: [PATCH] adding thread group --- include/ck/config.hpp | 5 +---- .../gpu/block/blockwise_gemm_xdlops.hpp | 15 ++++++--------- .../blockwise_tensor_slice_transfer_v4r1.hpp | 5 +---- include/ck/utility/common_header.hpp | 1 + include/ck/utility/get_id.hpp | 7 +++++-- 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/include/ck/config.hpp b/include/ck/config.hpp index 919af1e6dd..13a0edeee9 100644 --- a/include/ck/config.hpp +++ b/include/ck/config.hpp @@ -26,17 +26,14 @@ #endif #endif -// buffer resourse, wave size +// buffer resourse #ifndef __HIP_DEVICE_COMPILE__ // for host code #define CK_BUFFER_RESOURCE_3RD_DWORD -1 -#define CK_GPU_WAVE_SIZE -1 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ defined(__gfx90a__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 -#define CK_GPU_WAVE_SIZE 64 #elif defined(__gfx1030__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 -#define CK_GPU_WAVE_SIZE 32 #endif // FMA instruction diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp index 064a763374..45c99a3134 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp @@ -1,6 +1,4 @@ -#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP -#define CK_BLOCKWISE_GEMM_XDLOPS_HPP - +#pragma once #include "common_header.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "xdlops_gemm.hpp" @@ -8,7 +6,7 @@ namespace ck { -template {}; static constexpr auto I3 = Number<3>{}; - static constexpr index_t WaveSize = 64; + static constexpr index_t WaveSize = get_warp_size(); static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); @@ -53,7 +51,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 __device__ static auto GetWaveIdx() { - const index_t thread_id = get_thread_local_1d_id(); + const index_t thread_id = ThreadGroup::GetThreadId(); constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), @@ -120,8 +118,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 BK0NK1BlockDesc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); - static_assert(BlockSize == MWaves * NWaves * WaveSize, - "BlockSize != MWaves * NWaves * WaveSize\n"); + static_assert(ThreadGroup::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThreadGroup::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, "wrong!"); @@ -337,4 +335,3 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 }; } // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp index 5aa6600848..2358cdc98b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp @@ -1,6 +1,4 @@ -#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP -#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP - +#pragma once #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" @@ -169,4 +167,3 @@ struct BlockwiseTensorSliceTransfer_v4r1 }; } // namespace ck -#endif diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 45f387ef2a..83229b7cf1 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -27,6 +27,7 @@ #include "transpose_vectors.hpp" #include "inner_product.hpp" #include "element_wise_operation.hpp" +#include "thread_group.hpp" #include "debug.hpp" #include "amd_buffer_addressing.hpp" diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp index f742512d40..14938081e4 100644 --- a/include/ck/utility/get_id.hpp +++ b/include/ck/utility/get_id.hpp @@ -3,11 +3,14 @@ namespace ck { -__device__ constexpr index_t get_wave_size() { return CK_GPU_WAVE_SIZE; } +__host__ __device__ constexpr index_t get_warp_size() +{ // warpSize is defined by HIP + return warpSize; +} __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } -__device__ index_t get_wave_local_1d_id() { return threadIdx.x / get_wave_size(); } +__device__ index_t get_warp_local_1d_id() { return threadIdx.x / get_warp_size(); } __device__ index_t get_block_1d_id() { return blockIdx.x; }