mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
Do not use warpSize as compile time constant as it is removed (#2320)
* Do not use warpSize as compile time constant as it is removed
* Update tile_image_to_column_shape.hpp: update warpSize usage.
* clean-up all use of warpSize, make sure code builds
* fix

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: Bartlomiej Kocot <barkocot@amd.com>
This commit is contained in:
committed by
GitHub
parent
3af66e99ab
commit
4c57157d50
@@ -45,7 +45,7 @@ struct BlockwiseGemmXdlops_mx_pipeline_base
|
||||
|
||||
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
|
||||
|
||||
// Hardcode to 64, as HIP-provided "warpSize" would return 32 on RDNA GPUs.
|
||||
// Hardcode to 64, as the HIP-provided built-in "warpSize" would return 32 on RDNA GPUs.
|
||||
static constexpr index_t WaveSize = 64;
|
||||
|
||||
static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
|
||||
|
||||
@@ -40,7 +40,7 @@ struct BlockwiseGemmXdlops_pipeline_base
|
||||
|
||||
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
|
||||
|
||||
// Hardcode to 64, as HIP-provided "warpSize" would return 32 on RDNA GPUs.
|
||||
// Hardcode to 64, as the HIP-provided built-in "warpSize" would return 32 on RDNA GPUs.
|
||||
static constexpr index_t WaveSize = 64;
|
||||
|
||||
static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
|
||||
|
||||
@@ -141,7 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
|
||||
using Base::BMmaKStride;
|
||||
|
||||
static constexpr index_t WgpPerCU =
|
||||
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
|
||||
(4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
|
||||
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
|
||||
32768 / WgpPerCU,
|
||||
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
|
||||
@@ -631,7 +631,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
|
||||
static constexpr index_t KRepeat = KPerThread / KPerInnerLoop;
|
||||
|
||||
static constexpr index_t WgpPerCU =
|
||||
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
|
||||
(4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
|
||||
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
|
||||
32768 / WgpPerCU,
|
||||
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
|
||||
|
||||
@@ -143,7 +143,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
|
||||
using Base::BMmaKStride;
|
||||
|
||||
static constexpr index_t WgpPerCU =
|
||||
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
|
||||
(4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
|
||||
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
|
||||
32768 / WgpPerCU,
|
||||
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
|
||||
|
||||
@@ -141,7 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
|
||||
using Base::BMmaKStride;
|
||||
|
||||
static constexpr index_t WgpPerCU =
|
||||
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
|
||||
(4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
|
||||
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
|
||||
32768 / WgpPerCU,
|
||||
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
|
||||
@@ -632,7 +632,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
|
||||
static constexpr index_t KRepeat = KPerThread / KPerInnerLoop;
|
||||
|
||||
static constexpr index_t WgpPerCU =
|
||||
(4 * warpSize / BlockSize) >= 1 ? 4 * warpSize / BlockSize : 1;
|
||||
(4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
|
||||
static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
|
||||
32768 / WgpPerCU,
|
||||
(MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
|
||||
|
||||
Reference in New Issue
Block a user