diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp index b58e3a71b7..f366f309ff 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp @@ -35,10 +35,8 @@ struct BlockwiseGemmXdlops_mx_pipeline_base using ComputeTypeB = BDataType; using AccType = float; // for now only support V_MFMA_SCALE_F32 - static constexpr index_t APackedSize = - is_same_v, f4x2_pk_t> ? 2 : 1; - static constexpr index_t BPackedSize = - is_same_v, f4x2_pk_t> ? 2 : 1; + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -85,7 +83,7 @@ struct BlockwiseGemmXdlops_mx_pipeline_base static constexpr index_t NXdlPack = 2; static constexpr index_t KXdlPack = 2; - using HotLoopInstList = ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< + using HotLoopInstList = ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< // BlockSize, MPerBlock, NPerBlock, @@ -101,8 +99,7 @@ struct BlockwiseGemmXdlops_mx_pipeline_base MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops, - (is_same_v, f4x2_pk_t> || - is_same_v, f4x2_pk_t>)>; + (packed_size_v > 1 || packed_size_v > 1)>; static_assert(KPerThread % KPack == 0, "Wrong KPack setting; try increasing KPerThread or decreasing KPack"); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp index c36e49a20a..2868ce2567 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm.hpp @@ -151,21 +151,8 @@ struct DeviceMoeGemmMX : public DeviceMoEGemmMXBPreShuffle, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; int GetPreShuffleParameters() override { return NPerXDL; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp index eb289d8ddc..bb7dcae9de 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bns.hpp @@ -151,21 +151,8 @@ struct DeviceMoeGemmMXBNS : public DeviceMoEGemmMXBPreShuffle, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; int GetPreShuffleParameters() override { return NPerXDL; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp index de22d65937..e32301fcd2 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp @@ -182,21 +182,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 // TODO: Move this to blockwise pipeline base // KPack in packed data types for pk A/B - static constexpr index_t APackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; static constexpr index_t KPack = math::max(lcm_AK1_BK1, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp index 8ec7d5075a..a0e716ba8e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp @@ -182,21 +182,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle // TODO: Move this to blockwise pipeline base // KPack in packed data types for pk A/B - static constexpr index_t APackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; static constexpr index_t KPack = math::max(lcm_AK1_BK1, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp index 806acb1e63..94962d0c62 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp @@ -194,21 +194,8 @@ struct GridwiseMoeGemmMX static constexpr auto NXdlPack = 2; static constexpr auto KXdlPack = 2; - static constexpr index_t APackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; static constexpr bool is_single_rate_mfma = false; static constexpr auto is_scale_mfma = true; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp index 85e26150e5..8f34fe43d4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp @@ -199,21 +199,8 @@ struct GridwiseMoeGemmMXBNS static constexpr auto NXdlPack = 2; static constexpr auto KXdlPack = 2; - static constexpr index_t APackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) - return 2; - else - return 1; - }(); + static constexpr index_t APackedSize = packed_size_v; + static constexpr index_t BPackedSize = packed_size_v; static constexpr bool is_single_rate_mfma = false; static constexpr auto is_scale_mfma = true; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 1c3c69fd8b..b11e5cac04 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -90,8 +90,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_element_op_(src_element_op), dst_element_op_(dst_element_op) { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) + if constexpr((packed_size_v) > 1) { static_assert(is_same_v, remove_cvref_t>, "SrcData != DstData"); @@ -100,7 +99,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1 SrcScalarPerVector_ % PackedSize == 0 && DstScalarPerVector_ % PackedSize == 0, "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type"); - static_assert(SrcVectorDim == DstVectorDim, "pk_i4_t does not support transpose"); + static_assert(SrcVectorDim == DstVectorDim, + "Packed data type does not support transpose"); } } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp index 0d3d8759db..5efa4c1f81 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -96,8 +96,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather dst_element_op_(dst_element_op), gather_offsets_(gather_offsets) { - if constexpr(is_same_v, pk_i4_t> || - is_same_v, f4x2_pk_t>) + if constexpr((packed_size_v) > 1) { static_assert(is_same_v, remove_cvref_t>, "SrcData != DstData"); @@ -107,7 +106,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type"); static_assert(SrcVectorDim == DstVectorDim, - "pk_i4_t or f4x2_pk_t does not support transpose"); + "Packed data type does not support transpose"); } }