From a9b170b54195ab667ca814f80dd5dfbf4ad772f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 2 Sep 2024 10:39:49 +0200 Subject: [PATCH 1/5] Revert "Revert "Revert Revert Support access per groups and filter2x3 in grouped conv fwd (#1382) (#1406) (#1415)" (#1455)" (#1490) This reverts commit 5ff8eeebf98b2fd3eafe95aa84bae7fd5f8d3155. --- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 116 +++++++++++++----- ...ed_conv_fwd_xdl_merged_groups_instance.hpp | 96 +++++++++++++++ .../gpu/grouped_convolution_forward.hpp | 13 ++ ..._convolution_forward_xdl_merged_groups.inc | 112 +++++++++++++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 5 + ...groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 48 ++++++++ ..._groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 48 ++++++++ ..._groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 48 ++++++++ .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 4 + ...ups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 47 +++++++ ...oups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 47 +++++++ ...oups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 47 +++++++ 12 files changed, 597 insertions(+), 34 deletions(-) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index bb97d29534..8619b6f1a2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -102,10 +102,9 @@ __global__ void // offset base pointer for each work-group const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); - - const long_index_t e_batch_offset = + const long_index_t e_group_offset = amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx)); - const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx); + const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx); const long_index_t e_n_offset = amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); @@ -118,14 +117,14 @@ __global__ void DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); static_for<0, NumDTensor, 1>{}( - [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; }); if constexpr(isMultiA || isMultiB) { AsPointer p_as_grid_grp; BsPointer p_bs_grid_grp; - const auto& as_batch_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx); + const auto& as_group_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx); // compute_ptr_offset_of_n_ not need BatchStrideB so // in case of MultiA is false but isMultiB is true @@ -136,27 +135,27 @@ __global__ void static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size(); static_for<0, NumATensor, 1>{}([&](auto i) { - p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + as_n_offset[i]; + p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + as_n_offset[i]; }); } else { const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx); static_for<0, 1, 1>{}( - [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + a_n_offset; }); + [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + a_n_offset; }); } - const auto& bs_batch_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx); + const auto& bs_group_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx); static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size(); static_for<0, NumBTensor, 1>{}( - [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; }); + [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_group_offset[i]; }); GridwiseGemm::template Run( p_as_grid_grp, p_bs_grid_grp, p_ds_grid_grp, - p_e_grid + e_batch_offset + e_n_offset, + p_e_grid + e_group_offset + e_n_offset, p_shared, a_element_op, b_element_op, @@ -169,19 +168,19 @@ __global__ void } else { - const long_index_t a_batch_offset = + const long_index_t a_group_offset = amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); - const long_index_t b_batch_offset = + const long_index_t b_group_offset = amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx)); const long_index_t a_n_offset = amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); GridwiseGemm::template Run( - p_as_grid + a_batch_offset + a_n_offset, - p_bs_grid + b_batch_offset, + p_as_grid + a_group_offset + a_n_offset, + p_bs_grid + b_group_offset, p_ds_grid_grp, - p_e_grid + e_batch_offset + e_n_offset, + p_e_grid + e_group_offset + e_n_offset, p_shared, a_element_op, b_element_op, @@ -283,7 +282,8 @@ template + LoopScheduler LoopSched = make_default_loop_scheduler(), + index_t NumGroupsToMerge = 1> struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle : public DeviceGroupedConvFwdMultipleABD= 1); + static constexpr bool isMultiA = is_detected::value; static constexpr bool isMultiB = is_detected::value; @@ -318,7 +320,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ConvForwardSpecialization, true /*SplitN*/, ADataType, - EDataType>; + EDataType, + NumGroupsToMerge>; static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; @@ -517,7 +520,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { static_for<0, NumATensor, 1>{}([&](auto i) { // Init compute_ptr_offset_of_groups_ for multiple AB - compute_ptr_offset_of_groups_.BatchStrideA_(i) = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideA_(i) = + a_g_n_c_wis_strides[0] * NumGroupsToMerge; // Use GemmADataType/GemmBDataType to iterate over tuple (even if passed data // type is not tuple) @@ -545,7 +549,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle }); static_for<0, NumBTensor, 1>{}([&](auto i) { // Init compute_ptr_offset_of_groups_ for multiple AB - compute_ptr_offset_of_groups_.BatchStrideB_(i) = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideB_(i) = + b_g_k_c_xs_strides[0] * NumGroupsToMerge; using DataType = remove_cvref_t>; // It is possible that one of the AB is a pointer and one is a tuple. @@ -565,8 +570,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle } else { - compute_ptr_offset_of_groups_.BatchStrideA_ = a_g_n_c_wis_strides[0]; - compute_ptr_offset_of_groups_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideA_ = + a_g_n_c_wis_strides[0] * NumGroupsToMerge; + compute_ptr_offset_of_groups_.BatchStrideB_ = + b_g_k_c_xs_strides[0] * NumGroupsToMerge; compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_; // p_as and p_bs are pointers @@ -583,7 +590,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle p_ds_grid_(i) = static_cast(p_ds[i]); // D batch stride - compute_ptr_offset_of_groups_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + compute_ptr_offset_of_groups_.BatchStrideDs_(i) = + ds_g_n_k_wos_strides[i][0] * NumGroupsToMerge; compute_ptr_offset_of_n_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][1] * conv_N_per_block_; @@ -602,7 +610,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(conv_to_gemm_transformer_d); }); - compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0] * NumGroupsToMerge; compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_; // populate desc for Ds/E @@ -726,7 +734,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_; const index_t gdx = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); - const index_t gdy = arg.num_group_; + const index_t gdy = arg.num_group_ / NumGroupsToMerge; const index_t gdz = num_workgroups_per_Conv_N; const auto K = @@ -850,6 +858,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { namespace ctc = tensor_layout::convolution; + const index_t G = arg.b_g_k_c_xs_lengths_[I0]; + const index_t K = arg.b_g_k_c_xs_lengths_[I1]; + const index_t C = arg.b_g_k_c_xs_lengths_[I2]; + // check device if(get_device_name() == "gfx908") { @@ -898,6 +910,42 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle } } } + else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::Filter3x3) + { + if(C != 1) + { + return false; + } + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t filter_spatial_dim = arg.b_g_k_c_xs_lengths_[i + I3]; + + if(filter_spatial_dim != I3) + { + return false; + } + } + if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC()) + { + return false; + } + } + + if constexpr(NumGroupsToMerge > 1) + { + if(!(C == 1)) + { + return false; + } + if(G % NumGroupsToMerge != 0) + { + return false; + } + if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC()) + { + return false; + } + } // check vector access of A // FIXME: layout @@ -907,11 +955,16 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v || is_same_v || is_same_v) { - const index_t C = arg.a_g_n_c_wis_lengths_[2]; - + // Check access per C if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) { - return false; + // If not possible, check access per G + if(!(ABlockTransferSrcVectorDim == 1 && C == 1 && + is_NSpatialGK_GKSpatial_NSpatialGC() && + G % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } } } else @@ -928,8 +981,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v) { - const index_t C = arg.b_g_k_c_xs_lengths_[2]; - if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0)) { return false; @@ -953,8 +1004,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v || is_same_v || is_same_v || is_same_v) { - const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; - if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) { valid = false; @@ -999,8 +1048,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v || is_same_v || is_same_v) { - const index_t K = arg.e_g_n_k_wos_lengths_[2]; - if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) { return false; @@ -1298,7 +1345,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle << BBlockTransferSrcScalarPerVector << ", " << CDEBlockTransferScalarPerVector_NPerBlock << ", " << CShuffleMXdlPerWavePerShuffle << ", " - << CShuffleNXdlPerWavePerShuffle + << CShuffleNXdlPerWavePerShuffle << ", " + << NumGroupsToMerge << ">"; // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp new file mode 100644 index 0000000000..96baf6bb00 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; + +using namespace ck::tensor_layout::convolution; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd3x3 = ConvolutionForwardSpecialization::Filter3x3; + +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +template +using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Type| Type| Pipeline| ToMerge| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | Scheduler| | + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // Instances with NumGroupsPerBatch > 1 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // Instances with NumGroupsPerBatch > 1 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // Instances with NumGroupsPerBatch > 1 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 32> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index a5cf521db1..4e117f86aa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -18,6 +18,7 @@ #ifdef CK_USE_XDL #include "grouped_convolution_forward_xdl.inc" #include "grouped_convolution_forward_xdl_large_tensor.inc" +#include "grouped_convolution_forward_xdl_merged_groups.inc" #include "grouped_convolution_forward_comp_xdl.inc" #include "grouped_convolution_forward_mem_inter_xdl.inc" #include "grouped_convolution_forward_mem_intra_xdl.inc" @@ -202,6 +203,8 @@ struct DeviceOperationInstanceFactory>>& instances); +#endif + +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_BF16 +// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector>>& instances); +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index 5c12dda5ea..095e02795b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -14,6 +14,11 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp + # merged groups + # NHWGC, GKYXC, NHWGK + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp + xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp #mem # NHWGC, GKYXC, NHWGK xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..6fa4bc6e46 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp new file mode 100644 index 0000000000..9fa56f48c7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp new file mode 100644 index 0000000000..e226dae975 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index 5a346f1e94..1e572f9ceb 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -13,6 +13,10 @@ set(GROUPED_CONV3D_FWD xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..cf1fcec985 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp new file mode 100644 index 0000000000..bea62892d3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp new file mode 100644 index 0000000000..de44725413 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From 73b67f290f6602fe0461d48a2c103de460f14084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 3 Sep 2024 10:52:03 +0200 Subject: [PATCH 2/5] Add support for NGCHW in grouped conv bwd wei (#1491) * Add support for NGCHW in grouped conv bwd wei * Comments fixes * navi fixes * Update function names --- .../device_grouped_conv_bwd_weight_dl.hpp | 12 +- ...onv_bwd_weight_multiple_d_xdl_cshuffle.hpp | 10 +- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 412 ++++++++++++++++-- ..._grouped_conv_bwd_weight_wmma_cshuffle.hpp | 4 +- ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 10 +- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 6 +- ...d_multiple_d_xdl_large_tensor_cshuffle.hpp | 2 +- .../device/impl/device_grouped_conv_utils.hpp | 44 +- .../gpu/device/tensor_layout.hpp | 34 +- .../gpu/grid/gridwise_elementwise_2d.hpp | 54 ++- .../device_operation_instance_factory.hpp | 8 + ...conv_bwd_weight_two_stage_xdl_instance.hpp | 40 ++ .../grouped_convolution_backward_weight.hpp | 30 ++ ...rouped_convolution_backward_weight_xdl.inc | 46 ++ ...volution_host_tensor_descriptor_helper.hpp | 42 +- .../grouped_conv2d_bwd_weight/CMakeLists.txt | 2 + ..._ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp | 41 ++ ..._ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp | 41 ++ .../grouped_conv3d_bwd_weight/CMakeLists.txt | 2 + ...cdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp | 41 ++ ...cdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp | 41 ++ .../src/profile_grouped_conv_bwd_weight.cpp | 15 + script/convert_miopen_driver_to_profiler.py | 33 +- .../test_grouped_convnd_bwd_weight.cpp | 12 +- 24 files changed, 893 insertions(+), 89 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp index cc26936fef..0b3f1a0255 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp @@ -1039,14 +1039,14 @@ struct DeviceGroupedConvBwdWeight_Dl : public DeviceGroupedConvBwdWeight() || - is_GNWK_GKXC_GNWC())) || + (is_NWGC_GKXC_NWGK() || + is_GNWC_GKXC_GNWK())) || (NDimSpatial == 2 && - (is_NHWGK_GKYXC_NHWGC() || - is_GNHWK_GKYXC_GNHWC())) || + (is_NHWGC_GKYXC_NHWGK() || + is_GNHWC_GKYXC_GNHWK())) || (NDimSpatial == 3 && - (is_NDHWGK_GKZYXC_NDHWGC() || - is_GNDHWK_GKZYXC_GNDHWC())))) + (is_NDHWGC_GKZYXC_NDHWGK() || + is_GNDHWC_GKZYXC_GNDHWK())))) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp index 7f88ea692a..a7df1c9d57 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp @@ -864,23 +864,23 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle } if constexpr(NDimSpatial == 1) { - if constexpr(!is_GNWK_GKXC_GNWC()) + if constexpr(!is_GNWC_GKXC_GNWK()) { return false; } } else if constexpr(NDimSpatial == 2) { - if constexpr(!(is_NHWGK_GKYXC_NHWGC() || - is_GNHWK_GKYXC_GNHWC())) + if constexpr(!(is_NHWGC_GKYXC_NHWGK() || + is_GNHWC_GKYXC_GNHWK())) { return false; } } else if constexpr(NDimSpatial == 3) { - if constexpr(!(is_NDHWGK_GKZYXC_NDHWGC() || - is_GNDHWK_GKZYXC_GNDHWC())) + if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || + is_GNDHWC_GKZYXC_GNDHWK())) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index e18b8b9e28..cb2ef915c0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -22,6 +22,7 @@ #include #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -191,7 +192,9 @@ template + typename ComputeTypeB = ComputeTypeA, + index_t TransposeTransferSrcScalarPerVector = 1, + index_t TransposeTransferDstScalarPerVector = 1> struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle : public DeviceGroupedConvBwdWeight() || + is_NGCDHW_GKZYXC_NGKDHW()) || + is_same_v); + using AElementwiseOperation = OutElementwiseOperation; using BElementwiseOperation = InElementwiseOperation; using CDEElementwiseOperation = WeiElementwiseOperation; @@ -351,6 +359,142 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle batch)[I2]; } + static constexpr index_t ClusterLengthMPerBlock = + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1); + static constexpr index_t ClusterLengthNPerBlock = + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3); + + template ::type = false> + static auto MakeInputTransposeDesc(std::array g_n_c_wis_lengths, + std::array g_n_c_wis_strides) + { + const index_t& G = g_n_c_wis_lengths[0]; + const index_t& N = g_n_c_wis_lengths[1]; + const index_t& C = g_n_c_wis_lengths[2]; + const index_t& Hi = g_n_c_wis_lengths[3]; + const index_t& Wi = g_n_c_wis_lengths[4]; + + const index_t& GStride = g_n_c_wis_strides[0]; + const index_t& NStride = g_n_c_wis_strides[1]; + const index_t& CStride = g_n_c_wis_strides[2]; + const index_t& HiStride = g_n_c_wis_strides[3]; + const index_t& WiStride = g_n_c_wis_strides[4]; + + const auto desc = make_naive_tensor_descriptor( + make_tuple(N, G, C, Hi, Wi), make_tuple(NStride, GStride, CStride, HiStride, WiStride)); + const auto merged_desc = + transform_tensor_descriptor(desc, + make_tuple(make_merge_transform(make_tuple(N, G, C)), + make_merge_transform(make_tuple(Hi, Wi))), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return PadTensorDescriptor( + merged_desc, + make_tuple(MPerBlock / ClusterLengthMPerBlock, NPerBlock / ClusterLengthNPerBlock), + Sequence{}); + } + + template ::type = false> + static auto MakeOutputTransposeDesc(std::array g_n_c_wis_lengths, + std::array g_n_c_wis_strides) + { + const index_t& G = g_n_c_wis_lengths[0]; + const index_t& N = g_n_c_wis_lengths[1]; + const index_t& C = g_n_c_wis_lengths[2]; + const index_t& Hi = g_n_c_wis_lengths[3]; + const index_t& Wi = g_n_c_wis_lengths[4]; + + const index_t& NStride = g_n_c_wis_strides[1]; + const index_t HiStride = Wi * G * C; + const index_t WiStride = G * C; + const index_t GStride = C; + const index_t CStride = 1; + + const auto desc = make_naive_tensor_descriptor( + make_tuple(N, G, C, Hi, Wi), make_tuple(NStride, GStride, CStride, HiStride, WiStride)); + const auto merged_desc = + transform_tensor_descriptor(desc, + make_tuple(make_merge_transform(make_tuple(N, G, C)), + make_merge_transform(make_tuple(Hi, Wi))), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return PadTensorDescriptor( + merged_desc, + make_tuple(MPerBlock / ClusterLengthMPerBlock, NPerBlock / ClusterLengthNPerBlock), + Sequence{}); + } + + template ::type = false> + static auto MakeInputTransposeDesc(std::array g_n_c_wis_lengths, + std::array g_n_c_wis_strides) + { + const index_t& G = g_n_c_wis_lengths[0]; + const index_t& N = g_n_c_wis_lengths[1]; + const index_t& C = g_n_c_wis_lengths[2]; + const index_t& Di = g_n_c_wis_lengths[3]; + const index_t& Hi = g_n_c_wis_lengths[4]; + const index_t& Wi = g_n_c_wis_lengths[5]; + + const index_t& GStride = g_n_c_wis_strides[0]; + const index_t& NStride = g_n_c_wis_strides[1]; + const index_t& CStride = g_n_c_wis_strides[2]; + const index_t& DiStride = g_n_c_wis_strides[3]; + const index_t& HiStride = g_n_c_wis_strides[4]; + const index_t& WiStride = g_n_c_wis_strides[5]; + + const auto desc = make_naive_tensor_descriptor( + make_tuple(N, G, C, Di, Hi, Wi), + make_tuple(NStride, GStride, CStride, DiStride, HiStride, WiStride)); + const auto merged_desc = + transform_tensor_descriptor(desc, + make_tuple(make_merge_transform(make_tuple(N, G, C)), + make_merge_transform(make_tuple(Di, Hi, Wi))), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return PadTensorDescriptor( + merged_desc, + make_tuple(MPerBlock / ClusterLengthMPerBlock, NPerBlock / ClusterLengthNPerBlock), + Sequence{}); + } + + template ::type = false> + static auto MakeOutputTransposeDesc(std::array g_n_c_wis_lengths, + std::array g_n_c_wis_strides) + { + const index_t& G = g_n_c_wis_lengths[0]; + const index_t& N = g_n_c_wis_lengths[1]; + const index_t& C = g_n_c_wis_lengths[2]; + const index_t& Di = g_n_c_wis_lengths[3]; + const index_t& Hi = g_n_c_wis_lengths[4]; + const index_t& Wi = g_n_c_wis_lengths[5]; + + const index_t& NStride = g_n_c_wis_strides[1]; + const index_t DiStride = Hi * Wi * G * C; + const index_t HiStride = Wi * G * C; + const index_t WiStride = G * C; + const index_t GStride = C; + const index_t CStride = 1; + + const auto desc = make_naive_tensor_descriptor( + make_tuple(N, G, C, Di, Hi, Wi), + make_tuple(NStride, GStride, CStride, DiStride, HiStride, WiStride)); + const auto merged_desc = + transform_tensor_descriptor(desc, + make_tuple(make_merge_transform(make_tuple(N, G, C)), + make_merge_transform(make_tuple(Di, Hi, Wi))), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return PadTensorDescriptor( + merged_desc, + make_tuple(MPerBlock / ClusterLengthMPerBlock, NPerBlock / ClusterLengthNPerBlock), + Sequence{}); + } + + using InputTransposeDescType = + remove_cvref_t({}, {}))>; + using OutputTransposeDescType = + remove_cvref_t({}, {}))>; + using ABCGridDescs = decltype(GetABCGridDesc()); using AGridDesc_K0_M_K1 = remove_cvref_t; @@ -407,13 +551,9 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle ComputeTypeA, ComputeTypeB>; - static constexpr index_t ClusterLengthMPerBlock = - CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1); - static constexpr index_t ClusterLengthNPerBlock = - CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3); using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt; - using GridwiseElementwise = + using GridwiseElementwiseCast = GridwiseElementwise, Tuple, Tuple, @@ -431,6 +571,24 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle I1, I1>; + using GridwiseElementwiseTranspose = + GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMapElementwise, + element_wise::PassThrough, + BlockSize, + MPerBlock, + NPerBlock, + MPerBlock / ClusterLengthMPerBlock, + NPerBlock / ClusterLengthNPerBlock, + Sequence<1, 0>, + Sequence, + Sequence, + I1, + I0>; + // Argument using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( @@ -493,6 +651,45 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle end(a_g_n_k_wos_lengths), begin(output_spatial_lengths_)); + std::array b_g_n_c_wis_strides_transposed = + b_g_n_c_wis_strides; + std::array a_g_n_k_wos_strides_transposed = + a_g_n_k_wos_strides; + + // NGKHW - transpose needed + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + b_g_n_c_wis_strides_transposed[I0] = Conv_C_; + b_g_n_c_wis_strides_transposed[I2] = I1; + a_g_n_k_wos_strides_transposed[I0] = Conv_K_; + a_g_n_k_wos_strides_transposed[I2] = I1; + + if constexpr(NDimSpatial == 2) + { + b_g_n_c_wis_strides_transposed[I3] = + input_spatial_lengths_[I1] * Conv_G_ * Conv_C_; + b_g_n_c_wis_strides_transposed[I4] = Conv_G_ * Conv_C_; + a_g_n_k_wos_strides_transposed[I3] = + output_spatial_lengths_[I1] * Conv_G_ * Conv_K_; + a_g_n_k_wos_strides_transposed[I4] = Conv_G_ * Conv_K_; + } + else if constexpr(NDimSpatial == 3) + { + b_g_n_c_wis_strides_transposed[I3] = + input_spatial_lengths_[I1] * input_spatial_lengths_[I2] * Conv_G_ * Conv_C_; + b_g_n_c_wis_strides_transposed[I4] = + input_spatial_lengths_[I2] * Conv_G_ * Conv_C_; + b_g_n_c_wis_strides_transposed[I5] = Conv_G_ * Conv_C_; + a_g_n_k_wos_strides_transposed[I3] = output_spatial_lengths_[I1] * + input_spatial_lengths_[I2] * Conv_G_ * + Conv_K_; + a_g_n_k_wos_strides_transposed[I4] = + input_spatial_lengths_[I2] * Conv_G_ * Conv_K_; + a_g_n_k_wos_strides_transposed[I5] = Conv_G_ * Conv_K_; + } + } + const auto descs = conv_to_gemm_transformer_v2 .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( @@ -502,9 +699,9 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle input_spatial_lengths_, filter_spatial_lengths_, output_spatial_lengths_, - b_g_n_c_wis_strides, + b_g_n_c_wis_strides_transposed, e_g_k_c_xs_strides, - a_g_n_k_wos_strides, + a_g_n_k_wos_strides_transposed, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -540,8 +737,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle const index_t GemmN = b_grid_desc_k0_n_k1_.GetLength(I1); // A/B/C Batch Stride - compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; - compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0]; compute_ptr_offset_of_batch_.BatchStrideC_ = Conv_K_ * Conv_C_ * std::accumulate(begin(filter_spatial_lengths_), @@ -553,11 +750,56 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle ce_grid_desc_m_n_, GridwiseGemm::CalculateMBlock(GemmM), GridwiseGemm::CalculateNBlock(GemmN)); + + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + a_in_transpose_desc_ = + MakeInputTransposeDesc(a_g_n_k_wos_lengths, a_g_n_k_wos_strides); + a_out_transpose_desc_ = + MakeOutputTransposeDesc(a_g_n_k_wos_lengths, a_g_n_k_wos_strides); + + b_in_transpose_desc_ = + MakeInputTransposeDesc(b_g_n_c_wis_lengths, b_g_n_c_wis_strides); + b_out_transpose_desc_ = + MakeOutputTransposeDesc(b_g_n_c_wis_lengths, b_g_n_c_wis_strides); + + elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{ + a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)}; + + elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{ + b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)}; + } + } + + std::size_t GetWorkspaceATensorSizeBytes() const + { + return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(); + } + + std::size_t GetWorkspaceBTensorSizeBytes() const + { + return sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize(); + } + + std::size_t GetWorkspaceETensorSizeBytes() const + { + return sizeof(AccDataType) * ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_; } std::size_t GetWorkspaceSizeBytes() const { - return sizeof(AccDataType) * ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_; + // Transpose require workspace for A and B + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() + + GetWorkspaceETensorSizeBytes(); + } + else + { + return GetWorkspaceETensorSizeBytes(); + } } const ADataType* p_a_grid_; @@ -571,6 +813,11 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_; Block2TileMapElementwise elementwise_block_2_ctile_map_; + Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_, + elementwise_block_2_ctile_map_transpose_b_; + + InputTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_; + OutputTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_; // for computing batch offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; @@ -624,17 +871,23 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle AccDataType* p_c_grid = type_convert(arg.p_workspace_); + const ADataType* p_a_grid = arg.p_a_grid_; + const BDataType* p_b_grid = arg.p_b_grid_; + + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + p_a_grid = type_convert(arg.p_workspace_) + + arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType); + p_b_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) / + sizeof(BDataType); + } + // nullptr for output, will be set after workspace set - typename GridwiseGemm::Argument gemm_arg{arg.p_a_grid_, - arg.p_b_grid_, - p_c_grid, - GemmM, - GemmN, - GemmK, - I0, - I0, - I0, - arg.k_batch_}; + typename GridwiseGemm::Argument gemm_arg{ + p_a_grid, p_b_grid, p_c_grid, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_}; index_t gdx, gdy, gdz; std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize( @@ -651,8 +904,10 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle arg.a_grid_desc_k0_m_k1_.GetLength(Number<0>{}) / gemm_arg.KBatch; const auto clear_workspace = [&]() { - hip_check_error(hipMemsetAsync( - gemm_arg.p_c_grid, 0, arg.GetWorkspaceSizeBytes(), stream_config.stream_id_)); + hip_check_error(hipMemsetAsync(gemm_arg.p_c_grid, + 0, + arg.GetWorkspaceETensorSizeBytes(), + stream_config.stream_id_)); }; const auto Run = [&](const auto& kernel) { @@ -1261,6 +1516,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + float avg_time = 0.f; auto launch_elementwise_kernel = [&]() { const AccDataType* p_c_grid = type_convert(arg.p_workspace_); const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize( @@ -1270,7 +1526,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle std::array in_out_batch_strides = { static_cast(arg.compute_ptr_offset_of_batch_.BatchStrideC_)}; - const auto kernel = kernel_batched_elementwise, ck::Tuple, ck::Tuple, @@ -1296,7 +1552,54 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle in_out_batch_strides); }; - float avg_time = RunGemmV3(arg, stream_config); + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + const index_t grid_size_a = + arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( + arg.a_in_transpose_desc_); + const index_t grid_size_b = + arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize( + arg.b_in_transpose_desc_); + + ADataType* p_a_out_grid = type_convert(arg.p_workspace_) + + arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType); + BDataType* p_b_out_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) / + sizeof(BDataType); + + auto kernel_transpose = kernel_elementwise_dual, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapElementwise, + Block2TileMapElementwise, + element_wise::PassThrough>; + + avg_time += launch_and_time_kernel(stream_config, + kernel_transpose, + dim3(grid_size_a + grid_size_b), + dim3(BlockSize), + 0, + make_tuple(arg.a_in_transpose_desc_), + make_tuple(arg.b_in_transpose_desc_), + make_tuple(arg.a_out_transpose_desc_), + make_tuple(arg.b_out_transpose_desc_), + make_tuple(arg.p_a_grid_), + make_tuple(arg.p_b_grid_), + make_tuple(p_a_out_grid), + make_tuple(p_b_out_grid), + arg.elementwise_block_2_ctile_map_transpose_a_, + arg.elementwise_block_2_ctile_map_transpose_b_, + element_wise::PassThrough{}, + grid_size_a); + } + + avg_time += RunGemmV3(arg, stream_config); avg_time += launch_elementwise_kernel(); return avg_time; } @@ -1347,25 +1650,18 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle { return false; } - if constexpr(NDimSpatial == 1) + if constexpr(NDimSpatial == 2) { - if constexpr(!is_GNWK_GKXC_GNWC()) - { - return false; - } - } - else if constexpr(NDimSpatial == 2) - { - if constexpr(!(is_NHWGK_GKYXC_NHWGC() || - is_GNHWK_GKYXC_GNHWC())) + if constexpr(!(is_NHWGC_GKYXC_NHWGK() || + is_NGCHW_GKYXC_NGKHW())) { return false; } } else if constexpr(NDimSpatial == 3) { - if constexpr(!(is_NDHWGK_GKZYXC_NDHWGC() || - is_GNDHWK_GKZYXC_GNDHWC())) + if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || + is_NGCDHW_GKZYXC_NGKDHW())) { return false; } @@ -1431,6 +1727,35 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle return false; } + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) + { + if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVector != 0) + { + return false; + } + + if((arg.Conv_G_ * arg.Conv_K_) % TransposeTransferDstScalarPerVector != 0) + { + return false; + } + + const index_t input_spatial_acum = ck::accumulate_n( + arg.input_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + const index_t output_spatial_acum = ck::accumulate_n( + arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + + if(input_spatial_acum % TransposeTransferSrcScalarPerVector != 0) + { + return false; + } + + if(output_spatial_acum % TransposeTransferSrcScalarPerVector != 0) + { + return false; + } + } + return true; } @@ -1563,8 +1888,17 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", " << "BlkGemmPipelineVersion: " << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", " - << NumGroupsToMerge - << ">"; + << NumGroupsToMerge; + + if constexpr(is_NGCHW_GKYXC_NGKHW() || + is_NGCDHW_GKZYXC_NGKDHW()) { + str << ", TransposeTransferSrcScalarPerVector: " + << TransposeTransferSrcScalarPerVector <<", " + << "TransposeTransferDstScalarPerVector: " << TransposeTransferDstScalarPerVector; + } + + + str << ">"; // clang-format on return str.str(); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp index 5738be0fb3..0831b754c8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp @@ -710,8 +710,8 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle return false; } - if constexpr(!(is_NDHWGK_GKZYXC_NDHWGC() || - is_GNDHWK_GKZYXC_GNDHWC())) + if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || + is_GNDHWC_GKZYXC_GNDHWK())) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 3babd1896f..996107343d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -586,23 +586,23 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle } if constexpr(NDimSpatial == 1) { - if constexpr(!is_GNWK_GKXC_GNWC()) + if constexpr(!is_GNWC_GKXC_GNWK()) { return false; } } else if constexpr(NDimSpatial == 2) { - if constexpr(!(is_NHWGK_GKYXC_NHWGC() || - is_GNHWK_GKYXC_GNHWC())) + if constexpr(!(is_NHWGC_GKYXC_NHWGK() || + is_GNHWC_GKYXC_GNHWK())) { return false; } } else if constexpr(NDimSpatial == 3) { - if constexpr(!(is_NDHWGK_GKZYXC_NDHWGC() || - is_GNDHWK_GKZYXC_GNDHWC())) + if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || + is_GNDHWC_GKZYXC_GNDHWK())) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 8619b6f1a2..1b58280c30 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -925,7 +925,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle return false; } } - if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC()) + if constexpr(!is_NSpatialGC_GKSpatial_NSpatialGK()) { return false; } @@ -941,7 +941,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { return false; } - if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC()) + if constexpr(!is_NSpatialGC_GKSpatial_NSpatialGK()) { return false; } @@ -960,7 +960,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { // If not possible, check access per G if(!(ABlockTransferSrcVectorDim == 1 && C == 1 && - is_NSpatialGK_GKSpatial_NSpatialGC() && + is_NSpatialGC_GKSpatial_NSpatialGK() && G % ABlockTransferSrcScalarPerVector == 0)) { return false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp index 845751c51c..bc7f13441a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp @@ -713,7 +713,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor return false; } } - if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC()) + if constexpr(!is_NSpatialGC_GKSpatial_NSpatialGK()) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp index 3ee02558f4..29f70c417d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp @@ -12,7 +12,7 @@ namespace device { // 1d template -constexpr bool is_NWGK_GKXC_NWGC() +constexpr bool is_NWGC_GKXC_NWGK() { return is_same_v && is_same_v && @@ -20,7 +20,7 @@ constexpr bool is_NWGK_GKXC_NWGC() } template -constexpr bool is_GNWK_GKXC_GNWC() +constexpr bool is_GNWC_GKXC_GNWK() { return is_same_v && is_same_v && @@ -28,7 +28,7 @@ constexpr bool is_GNWK_GKXC_GNWC() } // 2d template -constexpr bool is_NHWGK_GKYXC_NHWGC() +constexpr bool is_NHWGC_GKYXC_NHWGK() { return is_same_v && is_same_v && @@ -36,15 +36,23 @@ constexpr bool is_NHWGK_GKYXC_NHWGC() } template -constexpr bool is_GNHWK_GKYXC_GNHWC() +constexpr bool is_GNHWC_GKYXC_GNHWK() { return is_same_v && is_same_v && is_same_v; } + +template +constexpr bool is_NGCHW_GKYXC_NGKHW() +{ + return is_same_v && + is_same_v && + is_same_v; +} // 3d template -constexpr bool is_NDHWGK_GKZYXC_NDHWGC() +constexpr bool is_NDHWGC_GKZYXC_NDHWGK() { return is_same_v && is_same_v && @@ -52,7 +60,7 @@ constexpr bool is_NDHWGK_GKZYXC_NDHWGC() } template -constexpr bool is_GNDHWK_GKZYXC_GNDHWC() +constexpr bool is_GNDHWC_GKZYXC_GNDHWK() { return is_same_v && is_same_v && @@ -60,19 +68,27 @@ constexpr bool is_GNDHWK_GKZYXC_GNDHWC() } template -constexpr bool is_NSpatialGK_GKSpatial_NSpatialGC() +constexpr bool is_NGCDHW_GKZYXC_NGKDHW() { - return is_NWGK_GKXC_NWGC() || - is_NHWGK_GKYXC_NHWGC() || - is_NDHWGK_GKZYXC_NDHWGC(); + return is_same_v && + is_same_v && + is_same_v; } template -constexpr bool is_GNSpatialK_GKSpatial_GNSpatialC() +constexpr bool is_NSpatialGC_GKSpatial_NSpatialGK() { - return is_GNWK_GKXC_GNWC() || - is_GNHWK_GKYXC_GNHWC() || - is_GNDHWK_GKZYXC_GNDHWC(); + return is_NWGC_GKXC_NWGK() || + is_NHWGC_GKYXC_NHWGK() || + is_NDHWGC_GKZYXC_NDHWGK(); +} + +template +constexpr bool is_GNSpatialC_GKSpatial_GNSpatialK() +{ + return is_GNWC_GKXC_GNWK() || + is_GNHWC_GKYXC_GNHWK() || + is_GNDHWC_GKZYXC_GNDHWK(); } template diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp index ecc71ba2f2..2202bc5695 100644 --- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -115,6 +115,23 @@ struct NDHWGC : public BaseTensorLayout static constexpr const char* name = "NDHWGC"; }; +// input tensor +// packed NGCW/NGCHW/NGCDHW +struct NGCW : public BaseTensorLayout +{ + static constexpr const char* name = "NGCW"; +}; + +struct NGCHW : public BaseTensorLayout +{ + static constexpr const char* name = "NGCHW"; +}; + +struct NGCDHW : public BaseTensorLayout +{ + static constexpr const char* name = "NGCDHW"; +}; + // input tensor // strided layout struct G_NW_C : public BaseTensorLayout @@ -325,6 +342,21 @@ struct NDHWGK : public BaseTensorLayout static constexpr const char* name = "NDHWGK"; }; +struct NGKW : public BaseTensorLayout +{ + static constexpr const char* name = "NGKW"; +}; + +struct NGKHW : public BaseTensorLayout +{ + static constexpr const char* name = "NGKHW"; +}; + +struct NGKDHW : public BaseTensorLayout +{ + static constexpr const char* name = "NGKDHW"; +}; + // output tensor // strided layout struct G_NW_K : public BaseTensorLayout diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp index 3439aefa42..2bbe4c8ce8 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -41,6 +41,55 @@ __global__ void elementwise_op); } +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_elementwise_dual(const InBGridDescTuple in_grid_desc_tuple_a, + const InBGridDescTuple in_grid_desc_tuple_b, + const OutAGridDescTuple out_grid_desc_tuple_a, + const OutBGridDescTuple out_grid_desc_tuple_b, + const InDataTypePointerTuple p_in_global_tuple_a, + const InDataTypePointerTuple p_in_global_tuple_b, + const OutDataTypePointerTuple p_out_global_tuple_a, + const OutDataTypePointerTuple p_out_global_tuple_b, + const Block2TileMapA block_2_tile_map_a, + const Block2TileMapB block_2_tile_map_b, + const ElementwiseOperation elementwise_op, + const index_t a_grid_size) +{ + if(get_block_1d_id() < a_grid_size) + { + GridwiseElementwiseFunctor::Run(in_grid_desc_tuple_a, + out_grid_desc_tuple_a, + p_in_global_tuple_a, + p_out_global_tuple_a, + block_2_tile_map_a, + elementwise_op, + get_block_1d_id()); + } + else + { + GridwiseElementwiseFunctor::Run(in_grid_desc_tuple_b, + out_grid_desc_tuple_b, + p_in_global_tuple_b, + p_out_global_tuple_b, + block_2_tile_map_b, + elementwise_op, + get_block_1d_id() - a_grid_size); + } +} + template {}); const auto block_work_idx = - block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id)); const index_t m0_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * M0PerBlock); diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 27758deb44..4358953a58 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -74,6 +74,10 @@ using GNWK = ck::tensor_layout::convolution::GNWK; using GNHWK = ck::tensor_layout::convolution::GNHWK; using GNDHWK = ck::tensor_layout::convolution::GNDHWK; +using NGKW = ck::tensor_layout::convolution::NGKW; +using NGKHW = ck::tensor_layout::convolution::NGKHW; +using NGKDHW = ck::tensor_layout::convolution::NGKDHW; + // using NWGC = ck::tensor_layout::convolution::NWGC; using NHWGC = ck::tensor_layout::convolution::NHWGC; @@ -87,6 +91,10 @@ using NWGK = ck::tensor_layout::convolution::NWGK; using NHWGK = ck::tensor_layout::convolution::NHWGK; using NDHWGK = ck::tensor_layout::convolution::NDHWGK; +using NGCW = ck::tensor_layout::convolution::NGCW; +using NGCHW = ck::tensor_layout::convolution::NGCHW; +using NGCDHW = ck::tensor_layout::convolution::NGCDHW; + // using G_K = ck::tensor_layout::convolution::G_K; using GK_Tuple = ck::Tuple; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp index 41303d2e95..2ce334d9d7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp @@ -56,6 +56,46 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std // clang-format on >; +// NGCHW requires transpose, we use vector loads and stores params for them +template +using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances = std::tuple< + // clang-format off + //#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups| + //#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge| + //#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| | + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | | + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, F16, F16, 2, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, F16, F16, 2, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, F16, F16, 1, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 1, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, F16, F16, 1, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 1, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, F16, F16, 1, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2, F16, F16, 2, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 32, 8, 32, 32, 1, 2, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 4, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8 ,1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 4, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8, 1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp index 5a703e5814..0f11d337f5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp @@ -367,6 +367,21 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances( + op_ptrs); + } #endif } } @@ -447,6 +462,21 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances( + op_ptrs); + } #endif } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc index b4bea8485d..f240fa3235 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc @@ -137,6 +137,29 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi PassThrough, PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances( @@ -240,6 +263,29 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16 PassThrough, PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp index ff697fb71c..d4ceefb458 100644 --- a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp +++ b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -46,6 +46,21 @@ std::vector get_layout_transpose_gnchw_to_old() { return {0, 1, 2, 3}; } + else if constexpr(ck::is_same_v || + ck::is_same_v) + { + return {1, 0, 2, 3}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v) + { + return {1, 0, 2, 3, 4}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v) + { + return {1, 0, 2, 3, 4, 5}; + } else if constexpr(ck::is_same_v || ck::is_same_v || ck::is_same_v) @@ -132,6 +147,18 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck::utils::conv::ConvPa param.input_spatial_lengths_.begin() + param.num_dim_spatial_); } // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } else if constexpr(ck::is_same_v || ck::is_same_v || ck::is_same_v) @@ -314,6 +341,19 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvP param.output_spatial_lengths_.begin(), param.output_spatial_lengths_.begin() + param.num_dim_spatial_); } + // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.end(), + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } else if constexpr(ck::is_same_v || ck::is_same_v || ck::is_same_v) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt index f730534c8d..8d67b46fbf 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt @@ -8,6 +8,8 @@ set(GROUPED_CONV2D_BWD_WEIGHT xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp + xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp ) if(DL_KERNELS) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp new file mode 100644 index 0000000000..bbab53d9b5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp new file mode 100644 index 0000000000..b3b5489930 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances< + 2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index 8e939c15a9..7857bb0293 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -8,6 +8,8 @@ set(GROUPED_CONV3D_BWD_WEIGHT xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp + xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp ) if(DL_KERNELS) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp new file mode 100644 index 0000000000..489fa81a7f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v2>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp new file mode 100644 index 0000000000..f9eadcec5b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances< + 3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v5>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 7dd75a5e0a..732c9fae29 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -16,6 +16,7 @@ enum struct ConvLayout GNCHW_GKCYX_GNKHW, // 0 GNHWC_GKYXC_GNHWK, // 1 NHWGC_GKYXC_NHWGK, // 2 + NGCHW_GKYXC_NGKHW, // 3 }; enum struct ConvDataType @@ -178,6 +179,13 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); } } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + } if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) { if(data_type == ConvDataType::F32_F32_F32) @@ -224,6 +232,13 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); } } + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py index 06c656c8e5..d468c62d60 100644 --- a/script/convert_miopen_driver_to_profiler.py +++ b/script/convert_miopen_driver_to_profiler.py @@ -23,6 +23,26 @@ def run_ck_profiler_cmd(cmd): subprocess.run(cmd) +def parse_layouts(args): + if args.in_layout == "NCW" or args.in_layout == "NCHW" or \ + args.in_layout == "NCDHW": + if args.ck_profier_op == "grouped_conv_bwd_weight": + args.layout = 3 + else: + print('Not supported layout for this op') + exit(1) + elif args.in_layout == "NWC" or args.in_layout == "NHWC" or \ + args.in_layout == "NDHWC": + if args.ck_profier_op == "grouped_conv_bwd_weight": + args.layout = 2 + elif args.ck_profier_op == "grouped_conv_bwd_data" or \ + args.ck_profier_op == "grouped_conv_fwd": + args.layout = 1 + else: + print('Not supported layout for this op') + exit(1) + + def parse_data_type(args): if args.data_type == "fp32": if args.ck_profier_op == "grouped_conv_bwd_weight" or \ @@ -79,8 +99,7 @@ def add_conv_params_to_cmd(args, cmd): def run_ck_grouped_conv_fwd(args): args.ck_profier_op = "grouped_conv_fwd" parse_data_type(args) - # default for MIOpen NHWGC - args.layout = 1 + parse_layouts(args) # use int32 by default args.index_type = 0 @@ -99,8 +118,7 @@ def run_ck_grouped_conv_fwd(args): def run_ck_grouped_conv_bwd_data(args): args.ck_profier_op = "grouped_conv_bwd_data" parse_data_type(args) - # default for MIOpen NHWGC - args.layout = 1 + parse_layouts(args) cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)] cmd += [str(args.data_type), str(args.layout)] @@ -117,8 +135,7 @@ def run_ck_grouped_conv_bwd_data(args): def run_ck_grouped_conv_bwd_weight(args): args.ck_profier_op = "grouped_conv_bwd_weight" parse_data_type(args) - # default for MIOpen NHWGC - args.layout = 2 + parse_layouts(args) # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128} args.split_k_value = -1 @@ -181,8 +198,8 @@ if __name__ == "__main__": parser.add_argument( "-in_layout", "-I", - default=-1, - type=int, + default="NCHW", + type=str, required=False, help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)" ) diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp index aee80cb2cb..4220d6a0cd 100644 --- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp +++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp @@ -66,6 +66,12 @@ class TestGroupedConvndBwdWeight : public ::testing::Test { return true; } + // Skip due to the lack of kernels for NGCDHW + if constexpr(std::is_same_v || std::is_same_v || + std::is_same_v) + { + return true; + } } else { @@ -139,7 +145,8 @@ using KernelTypes2d = ::testing::Types< std::tuple>, std::tuple>, std::tuple>, - std::tuple>>; + std::tuple>, + std::tuple>>; using KernelTypes3d = ::testing::Types< std::tuple>, std::tuple>, @@ -148,7 +155,8 @@ using KernelTypes3d = ::testing::Types< std::tuple>, std::tuple>, std::tuple>, - std::tuple>>; + std::tuple>, + std::tuple>>; TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d); TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d); From 841009c5ee5a7913c0b19eb541ac39fea9ef6a4c Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 4 Sep 2024 07:36:27 -0700 Subject: [PATCH 3/5] Add an option to select an alternative python version during build. (#1496) * locate a newwer version of python when -DRHEL=ON flag is set * allow setting python version on cmake command line --- CMakeLists.txt | 14 +++++++++++++- .../gpu/mha/CMakeLists.txt | 14 +++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 697eb3b745..bf42b6aa15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,19 @@ set(version 1.1.0) project(composable_kernel VERSION ${version} LANGUAGES CXX HIP) include(CTest) -find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) +if(NOT CK_USE_ALTERNATIVE_PYTHON) + find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) +else() + message("Using alternative python version") + set(EXTRA_PYTHON_PATH) + string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}") + message("alternative python path is: ${EXTRA_PYTHON_PATH}") + find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) + add_definitions(-DPython3_EXECUTABLE="${CK_USE_ALTERNATIVE_PYTHON}") + set(Python3_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}") + set(PYTHON_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}") + set(ENV{LD_LIBRARY_PATH} "${EXTRA_PYTHON_PATH}/lib:$ENV{LD_LIBRARY_PATH}") +endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index 59ae09b739..8d159662a2 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -2,7 +2,19 @@ set(FMHA_CPP_FOLDER ${CMAKE_CURRENT_BINARY_DIR}) set(FMHA_SRC_FOLDER ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/) set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/) # python stuff -find_package(PythonInterp 3 REQUIRED) +if(NOT CK_USE_ALTERNATIVE_PYTHON) + find_package(PythonInterp 3 REQUIRED) +else() + message("Using alternative python version") + set(EXTRA_PYTHON_PATH) + string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}") + message("alternative python path is: ${EXTRA_PYTHON_PATH}") + find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) + add_definitions(-DPython3_EXECUTABLE="${CK_USE_ALTERNATIVE_PYTHON}") + set(Python3_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}") + set(PYTHON_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}") + set(ENV{LD_LIBRARY_PATH} "${EXTRA_PYTHON_PATH}/lib:$ENV{LD_LIBRARY_PATH}") +endif() rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile) From 8b95d9ad523c426dd3c6ada1f5cc121689278f06 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 4 Sep 2024 07:36:41 -0700 Subject: [PATCH 4/5] copy all fmha headers when building library (#1497) * copy all fmha headers when building library * fix the rocm_install call for mha headers --- .../gpu/mha/CMakeLists.txt | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index 8d159662a2..3499779c2a 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -18,17 +18,10 @@ endif() rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile) -rocm_install(FILES - "${FMHA_SRC_FOLDER}/fmha_fwd.hpp" - "${FMHA_SRC_FOLDER}/bias.hpp" - "${FMHA_SRC_FOLDER}/mask.hpp" - DESTINATION include/ck_tile/ops -) - -# header for building lib -file(COPY ${FMHA_SRC_FOLDER}/fmha_fwd.hpp DESTINATION ${FMHA_CPP_FOLDER}) -file(COPY ${FMHA_SRC_FOLDER}/bias.hpp DESTINATION ${FMHA_CPP_FOLDER}) -file(COPY ${FMHA_SRC_FOLDER}/mask.hpp DESTINATION ${FMHA_CPP_FOLDER}) +file(GLOB MHA_HEADERS "${FMHA_SRC_FOLDER}/*.hpp") +rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops) +# headers for building lib +file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER}) # generate a list of kernels, but not actually emit files at config stage execute_process( From 52410b49c79e9c3f3d5fd8b51f659be325cbb1ae Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:36:57 -0500 Subject: [PATCH 5/5] Temporarily disable flaky test for all (#1495) --- test/data_type/CMakeLists.txt | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index 3fe0b211a8..95f1367fbf 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -1,12 +1,6 @@ -if (GPU_TARGETS) - if (NOT GPU_TARGETS MATCHES "gfx94") - add_definitions(-DCK_SKIP_FLAKY_F8_TEST) - set(CK_SKIP_FLAKY_F8_TEST "ON") - endif() -else() - add_definitions(-DCK_SKIP_FLAKY_F8_TEST) - set(CK_SKIP_FLAKY_F8_TEST "ON") -endif() +# temporarily disable flaky test for all architectures +add_definitions(-DCK_SKIP_FLAKY_F8_TEST) +set(CK_SKIP_FLAKY_F8_TEST "ON") if (USE_BITINT_EXTENSION_INT4) add_gtest_executable(test_int4 test_int4.cpp)