diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 55f5620616..ebb51506b1 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -245,6 +245,12 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) namespace ck { +#if defined(__GFX9__) + __device__ static constexpr int WarpSize = 64; +#else + __device__ static constexpr int WarpSize = 32; +#endif + enum struct InMemoryDataOperationEnum { Set, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp index 9acfd00858..c9cc9c107a 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp @@ -141,7 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_v2= 1 ? 4 * warpSize / BlockSize : 1; + (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1; static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil( 32768 / WgpPerCU, (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock); @@ -634,7 +634,7 @@ struct BlockwiseGemmXdlops_pipeline_v2= 1 ? 4 * warpSize / BlockSize : 1; + (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1; static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil( 32768 / WgpPerCU, (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock); diff --git a/include/ck/utility/workgroup_synchronization.hpp b/include/ck/utility/workgroup_synchronization.hpp index 24858fdbdc..af5b0808fb 100644 --- a/include/ck/utility/workgroup_synchronization.hpp +++ b/include/ck/utility/workgroup_synchronization.hpp @@ -32,7 +32,7 @@ static __device__ void gms_init(int NumWarps, int* p_control_bits) // all the workgroups in the synchronization group is supposed to call this function static __device__ void gms_barrier(int* p_control_bits) { - constexpr int mask = warpSize - 1; + constexpr int mask = WarpSize - 1; if((threadIdx.x & mask) == 0) {