diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp index aa5a899779..2d8def5114 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp @@ -58,10 +58,12 @@ struct BlockwiseGemmXdlops_mx_pipeline_base //> store rows/cols into thread registers in chunks of 16 //> e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47] - static constexpr index_t APackedSize = is_same_v, f4x2_pk_t> - ? 2 - : 1; - static constexpr index_t KThreadChunk = 16 * APackedSize/ sizeof(ComputeTypeA); + static constexpr index_t APackedSize = + is_same_v, f4x2_pk_t> ? 2 : 1; + static constexpr index_t BPackedSize = + is_same_v, f4x2_pk_t> ? 2 : 1; + + static constexpr index_t KThreadChunk = 16 * APackedSize / sizeof(ComputeTypeA); static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; static constexpr index_t KRepeat = KPerThread / KPack; @@ -327,18 +329,18 @@ struct BlockwiseGemmXdlops_mx_pipeline_base // Read buffer + Compute buffer // A[M0, M1, M2, KPack] static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( - make_tuple(Number{}, I1, Number{}, Number{}), - make_tuple(Number{}, - Number{}, - Number{}, + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + Number{}, I1)); // B[N0, N1, N2, KPack] static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor( - make_tuple(Number{}, I1, Number{}, Number{}), - make_tuple(Number{}, - Number{}, - Number{}, + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + Number{}, I1)); // C[M, N, NumRegXdlops] diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_mx.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_mx.hpp index e9ddf8346a..04f4ee8879 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_mx.hpp @@ -141,7 +141,9 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - vector_type a_thread_vec; // = vec: pk_i4_t, 32 - vector_type b_thread_vec; + vector_type + a_thread_vec; // = vec: pk_i4_t, 32 + vector_type b_thread_vec; - static_for<0, KPack / 2, 1>{}([&](auto ik) { + static_for<0, KPack / APackedSize, 1>{}([&](auto ik) { a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; @@ -578,12 +581,14 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx(); using mfma_input_type_a = typename vector_type::type; + xdlops_gemm.K1PerXdlops / + APackedSize>::type; // mfma input type = pk_f4_t, 32 // CK_PRINT(); using mfma_input_type_b = typename vector_type::type; + xdlops_gemm.K1PerXdlops / + BPackedSize>::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -721,10 +726,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; - static_for<0, KPack / 2, 1>{}([&](auto ik) { + static_for<0, KPack / APackedSize, 1>{}([&](auto ik) { a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; @@ -751,9 +756,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -805,10 +812,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; - static_for<0, KPack / 2, 1>{}([&](auto ik) { + static_for<0, KPack / APackedSize, 1>{}([&](auto ik) { a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; @@ -835,9 +842,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -858,10 +867,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; - static_for<0, KPack / 2, 1>{}([&](auto ik) { + static_for<0, KPack / APackedSize, 1>{}([&](auto ik) { a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; @@ -888,9 +897,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp index 902721c49d..500013fc79 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp @@ -344,22 +344,22 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX; + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; Run(kernel); } else { - const auto kernel = kernel_gemm_xdl_cshuffle_v3< - GridwiseGemm, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Even>; + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; Run(kernel); } } @@ -369,20 +369,20 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX; + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Odd>; Run(kernel); } else { const auto kernel = kernel_gemm_xdl_cshuffle_v3; + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Even>; Run(kernel); } } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 17f14c2b8f..a234c581e0 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1247,16 +1247,16 @@ struct ThreadwiseTensorSliceTransfer_v4 { // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to // DstData) - vector_type_maker_t dst_tmp_vector; + vector_type_maker_t dst_tmp_vector; // TODO: if SrcData and DstData are vetor type, then static_cast may not compile - static_for<0, SrcScalarPerVector / 2, 1>{}([&](auto i) { + static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) { dst_tmp_vector.template AsType()(i) = type_convert(src_tmp_vector.template AsType()[i]); }); // copy data from dst_tmp_vector into dst_buf - static_for<0, SrcScalarPerVector / 2, 1>{}([&](auto i) { + static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) { constexpr index_t dst_offset = dst_desc.CalculateOffset( dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);