From d5bbd4c3f195bc41215b464938f070a19ecd403c Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Fri, 30 Jan 2026 01:58:18 -0500 Subject: [PATCH] [WIP] initial implementation --- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 39 +- .../transform_conv_bwd_data_to_gemm_v1.hpp | 326 ++++++++++++++--- ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 37 +- .../gpu/grouped_convolution_backward_data.hpp | 2 + .../grouped_convolution_backward_data_xdl.inc | 14 + .../grouped_conv2d_bwd_data/CMakeLists.txt | 1 + ...c_gkyxc_gnhwk_f16_group_merge_instance.cpp | 40 +++ .../profile_grouped_conv_bwd_data_impl.hpp | 6 +- profiler/src/CMakeLists.txt | 334 +++++++++--------- 9 files changed, 560 insertions(+), 239 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_group_merge_instance.cpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 16b12cf386..f2abb48ede 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -287,7 +287,8 @@ template + index_t MaxTransposeTransferOutScalarPerVector = 1, + index_t NumGroupsToMerge = 1> struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 : public DeviceGroupedConvBwdDataMultipleD; @@ -964,10 +965,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 math::integer_divide_ceil(gemms_count_, MaxGroupedGemmGroupsNum)); gemms_grid_size_.push_back(grid_size); - // A/B/Ds/E Batch Stride - compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0]; - compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides_transposed[0]; - compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides_transposed[0]; + // A/B/Ds/E Batch Stride (multiply by NumGroupsToMerge for group merging) + compute_ptr_offset_of_batch_.BatchStrideA_ = + a_g_n_k_wos_strides_transposed[0] * NumGroupsToMerge; + compute_ptr_offset_of_batch_.BatchStrideB_ = + b_g_k_c_xs_strides_transposed[0] * NumGroupsToMerge; + compute_ptr_offset_of_batch_.BatchStrideE_ = + e_g_n_c_wis_strides_transposed[0] * NumGroupsToMerge; compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[1] * conv_N_per_block_; @@ -1147,7 +1151,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 { float ave_time = 0; - const index_t gdy = arg.num_group_; + const index_t gdy = arg.num_group_ / NumGroupsToMerge; const index_t gdz = arg.num_workgroups_per_Conv_N_ * arg.k_batch_; const ADataType* p_a_grid = arg.p_a_grid_; @@ -1542,6 +1546,21 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.e_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); const index_t input_spatial_acum = ck::accumulate_n( arg.a_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); + + // Validation: Check that NumGroupsToMerge divides ConvG evenly + if constexpr(NumGroupsToMerge > 1) + { + if(ConvG % NumGroupsToMerge != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Unsupported! Conv_G % NumGroupsToMerge != 0: Conv_G=" << ConvG + << ", NumGroupsToMerge=" << NumGroupsToMerge << std::endl; + } + return false; + } + } + // Specifialization if constexpr(ConvBackwardDataSpecialization == ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) @@ -1908,7 +1927,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 << TransposeTransferInScalarPerVectorAligned <<", " << "TransposeTransferOutScalarPerVectorAligned: " << TransposeTransferOutScalarPerVectorAligned; } - + + if constexpr(NumGroupsToMerge > 1) + { + str << ", NumGroupsToMerge: " << NumGroupsToMerge; + } str << ">"; diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp index 03dc0efeb5..bb388951fc 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp @@ -508,9 +508,19 @@ struct TransformConvBwdDataToGemm_v1 ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - - return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), - make_tuple(WoStride_, KStrideTensorA_)); + if constexpr(NumGroupsToMerge == 1) + { + return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), + make_tuple(WoStride_, KStrideTensorA_)); + } + else + { + // Add NumGroupsToMerge dimension for group merging + const index_t BatchStride = NStrideTensorA_; + return make_naive_tensor_descriptor( + make_tuple(N_ * Ho_ * Wo_, NumGroupsToMerge, K_), + make_tuple(WoStride_, BatchStride, KStrideTensorA_)); + } } else { @@ -607,14 +617,105 @@ struct TransformConvBwdDataToGemm_v1 __host__ __device__ auto MakeWeiGridDesc() const { + // Power-of-2 constraint for NumGroupsToMerge (required for XOR transform) + static_assert(NumGroupsToMerge == 1 || NumGroupsToMerge == 2 || NumGroupsToMerge == 4 || + NumGroupsToMerge == 8 || NumGroupsToMerge == 16 || NumGroupsToMerge == 32 || + NumGroupsToMerge == 64); if constexpr(is_same_v) { - return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_)); + if constexpr(NumGroupsToMerge == 1) + { + // Original V1 logic - no group merging + return make_naive_tensor_descriptor_packed(make_tuple(K_, Y_, X_, C_)); + } + else + { + // Group merging logic - XOR + merge pattern from bwd_weight V2 + // Add NumGroupsToMerge for M dimension and 1 as placeholder for N dimension + constexpr auto NumGroupsToMergeNumber = Number{}; + const auto desc = make_naive_tensor_descriptor_packed( + make_tuple(NumGroupsToMergeNumber, K_, Y_ * X_, I1, C_)); + + // Pad placeholder dimension from 1 to NumGroupsToMerge + const auto padded_desc = transform_tensor_descriptor( + desc, + make_tuple(make_pass_through_transform(NumGroupsToMergeNumber), + make_pass_through_transform(K_), + make_pass_through_transform(Y_ * X_), + make_pad_transform(I1, I0, NumGroupsToMergeNumber - I1), + make_pass_through_transform(C_)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + // XOR transform to select diagonal (where group indices match) + const auto unmerged_padded_desc = transform_tensor_descriptor( + padded_desc, + make_tuple(make_xor_transform( + make_tuple(NumGroupsToMergeNumber, NumGroupsToMergeNumber)), + make_pass_through_transform(K_), + make_pass_through_transform(Y_ * X_), + make_pass_through_transform(C_)), + make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{})); + + // Merge to create M and N dimensions for GEMM + return transform_tensor_descriptor( + unmerged_padded_desc, + make_tuple( + make_merge_transform(make_tuple(K_, NumGroupsToMergeNumber)), // M dimension + make_merge_transform( + make_tuple(Y_ * X_, NumGroupsToMergeNumber, C_))), // N dimension + make_tuple(Sequence<1, 0>{}, Sequence<2, 3, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } } else if constexpr(is_same_v) { - return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_)); + if constexpr(NumGroupsToMerge == 1) + { + // Original V1 logic - no group merging + return make_naive_tensor_descriptor_packed(make_tuple(K_, Z_, Y_, X_, C_)); + } + else + { + // Group merging logic for 3D - XOR + merge pattern from bwd_weight V2 + constexpr auto NumGroupsToMergeNumber = Number{}; + const auto desc = make_naive_tensor_descriptor_packed( + make_tuple(NumGroupsToMergeNumber, K_, Z_ * Y_ * X_, I1, C_)); + + const auto padded_desc = transform_tensor_descriptor( + desc, + make_tuple(make_pass_through_transform(NumGroupsToMergeNumber), + make_pass_through_transform(K_), + make_pass_through_transform(Z_ * Y_ * X_), + make_pad_transform(I1, I0, NumGroupsToMergeNumber - I1), + make_pass_through_transform(C_)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto unmerged_padded_desc = transform_tensor_descriptor( + padded_desc, + make_tuple(make_xor_transform( + make_tuple(NumGroupsToMergeNumber, NumGroupsToMergeNumber)), + make_pass_through_transform(K_), + make_pass_through_transform(Z_ * Y_ * X_), + make_pass_through_transform(C_)), + make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{})); + + return transform_tensor_descriptor( + unmerged_padded_desc, + make_tuple( + make_merge_transform(make_tuple(K_, NumGroupsToMergeNumber)), + make_merge_transform(make_tuple(Z_ * Y_ * X_, NumGroupsToMergeNumber, C_))), + make_tuple(Sequence<1, 0>{}, Sequence<2, 3, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } } else { @@ -670,20 +771,49 @@ struct TransformConvBwdDataToGemm_v1 math::integer_divide_ceil(K_, AK1 * K0PerBlock * batch_k_) * K0PerBlock; // A: output tensor - const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( - out_grid_desc, - make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), - make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + if constexpr(NumGroupsToMerge == 1) + { + const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( + out_grid_desc, + make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), + make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); - const auto out_gemmak0_gemmm_gemmak1_grid_desc = - ck::tensor_operation::device::PadTensorDescriptor( - out_gemmak0_gemmmraw_gemmak1_grid_desc, - make_tuple(AK0 * batch_k_, GemmMPerBlock, AK1), - Sequence{}); + const auto out_gemmak0_gemmm_gemmak1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + out_gemmak0_gemmmraw_gemmak1_grid_desc, + make_tuple(AK0 * batch_k_, GemmMPerBlock, AK1), + Sequence{}); - return out_gemmak0_gemmm_gemmak1_grid_desc; + return out_gemmak0_gemmm_gemmak1_grid_desc; + } + else + { + // Group merging: out_grid_desc has (N * Ho * Wo, NumGroupsToMerge, K) + // Merge NumGroupsToMerge with K to form M dimension + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_grid_desc, + make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), + make_merge_transform(make_tuple(NumGroupsToMerge, K_))), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)), + make_pass_through_transform(NumGroupsToMerge * K_)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto out_gemmak0_gemmm_gemmak1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + out_gemmak0_gemmmraw_gemmak1_grid_desc, + make_tuple(AK0 * batch_k_, GemmMPerBlock, AK1), + Sequence{}); + + return out_gemmak0_gemmm_gemmak1_grid_desc; + } } else { @@ -947,26 +1077,54 @@ struct TransformConvBwdDataToGemm_v1 ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t K0PerBlock = GemmKPerBlock / BK1; - const index_t BK0 = - math::integer_divide_ceil(K_, BK1 * K0PerBlock * batch_k_) * K0PerBlock; - // B: weight tensor - const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor( - make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), - make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)), - make_pass_through_transform(C_)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); + if constexpr(NumGroupsToMerge == 1) + { + const index_t K0PerBlock = GemmKPerBlock / BK1; + const index_t BK0 = + math::integer_divide_ceil(K_, BK1 * K0PerBlock * batch_k_) * K0PerBlock; - const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = - ck::tensor_operation::device::PadTensorDescriptor( - wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, - make_tuple(BK0 * batch_k_, GemmNPerBlock, BK1), - Sequence{}); + const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), + make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)), + make_pass_through_transform(C_)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - return wei_gemmbk0_gemmn_gemmbk1_grid_desc; + const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, + make_tuple(BK0 * batch_k_, GemmNPerBlock, BK1), + Sequence{}); + + return wei_gemmbk0_gemmn_gemmbk1_grid_desc; + } + else + { + // Group merging: wei_grid_desc already has merged M, N from MakeWeiGridDesc() + // It returns (K * NumGroupsToMerge, Y * X * NumGroupsToMerge * C) + const auto wei_grid_desc = MakeWeiGridDesc(); + + const index_t K0PerBlock = GemmKPerBlock / BK1; + const index_t BK0 = + math::integer_divide_ceil(K_ * NumGroupsToMerge, BK1 * K0PerBlock * batch_k_) * + K0PerBlock; + + const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor( + wei_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)), + make_pass_through_transform(Y_ * X_ * NumGroupsToMerge * C_)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, + make_tuple(BK0 * batch_k_, GemmNPerBlock, BK1), + Sequence{}); + + return wei_gemmbk0_gemmn_gemmbk1_grid_desc; + } } else { @@ -1161,32 +1319,86 @@ struct TransformConvBwdDataToGemm_v1 // C: input tensor if constexpr(NDimSpatial == 2) { - const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( - in_grid_desc, - make_tuple( - make_pass_through_transform(N_), - make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), - make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), - make_pass_through_transform(C_)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + if constexpr(NumGroupsToMerge == 1) + { + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_grid_desc, + make_tuple( + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(C_)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( - in_n_y_ho_x_wo_c_grid_desc, - make_tuple(make_freeze_transform(I0), - make_freeze_transform(I0), - make_merge_transform(make_tuple(N_, Ho_, Wo_)), - make_pass_through_transform(C_)), - make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), - make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N_, Ho_, Wo_)), + make_pass_through_transform(C_)), + make_tuple( + Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); - const auto in_gemmm_gemmn_grid_desc = - ck::tensor_operation::device::PadTensorDescriptor( - in_gemmmraw_gemmnraw_grid_desc, - make_tuple(GemmMPerBlock, GemmNPerBlock), - Sequence{}); + const auto in_gemmm_gemmn_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + in_gemmmraw_gemmnraw_grid_desc, + make_tuple(GemmMPerBlock, GemmNPerBlock), + Sequence{}); - return in_gemmm_gemmn_grid_desc; + return in_gemmm_gemmn_grid_desc; + } + else + { + // Group merging: Add NumGroupsToMerge dimension and merge with C + // in_grid_desc is (N, Hi, Wi, C) from MakeInGridDesc() + const index_t BatchStride = NStrideTensorC_; + + // Create descriptor with NumGroupsToMerge dimension + const auto in_n_hi_wi_numgroups_c_grid_desc = make_naive_tensor_descriptor( + make_tuple(N_, Hi_, Wi_, NumGroupsToMerge, C_), + make_tuple( + NStrideTensorC_, HiStride_, WiStride_, BatchStride, CStrideTensorC_)); + + const auto in_n_y_ho_x_wo_numgroups_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_numgroups_c_grid_desc, + make_tuple( + make_pass_through_transform(N_), + make_embed_transform(make_tuple(I1, Ho_), make_tuple(I1, ConvStrideH_)), + make_embed_transform(make_tuple(I1, Wo_), make_tuple(I1, ConvStrideW_)), + make_pass_through_transform(NumGroupsToMerge), + make_pass_through_transform(C_)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5>{}, + Sequence<6>{})); + + const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_numgroups_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N_, Ho_, Wo_)), + make_merge_transform(make_tuple(NumGroupsToMerge, C_))), + make_tuple( + Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5, 6>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmm_gemmn_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + in_gemmmraw_gemmnraw_grid_desc, + make_tuple(GemmMPerBlock, GemmNPerBlock), + Sequence{}); + + return in_gemmm_gemmn_grid_desc; + } } else if constexpr(NDimSpatial == 3) { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp index 970bcb0439..8e53d03a03 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp @@ -68,12 +68,37 @@ using device_grouped_conv_bwd_data_xdl_f16_16_16_instances = // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4> + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, F16, F16, 1, 1, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, F16, F16, 1, 1, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 1> + // clang-format on + >; + +// Dedicated tuple for NumGroupsToMerge > 1 testing (Filter1x1Stride1Pad0 only) +template +using device_grouped_conv_bwd_data_xdl_f16_16_16_group_merge_instances = + std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // NumGroupsToMerge = 2 test instances + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 2>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, F16, F16, 1, 1, 2>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 2>, + // NumGroupsToMerge = 4 test instances + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, F16, F16, 1, 1, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4, LoopScheduler::Default, F16, F16, 1, 1, 4> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp index f784b6ea51..07a2518fb6 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp @@ -80,6 +80,8 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_group_merge_instances( + op_ptrs); } #endif #ifdef CK_ENABLE_FP32 diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc index 7c61f3ee66..e9d4069935 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc @@ -23,6 +23,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_group_merge_instances( + std::vector>>& instances); + #endif #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances( diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt index 19e27cf173..20f96bcab8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt @@ -6,6 +6,7 @@ add_instance_library( device_grouped_conv2d_bwd_data_instance xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_group_merge_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_group_merge_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_group_merge_instance.cpp new file mode 100644 index 0000000000..73611c28ea --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_group_merge_instance.cpp @@ -0,0 +1,40 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Test instances for NumGroupsToMerge > 1 (Filter1x1Stride1Pad0 only) +void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_group_merge_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_group_merge_instances< + 2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index eceb70c05f..f4bff16a08 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -78,6 +78,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size); DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size); + if(do_verification > 0) + do_verification = 2; + // Initialize tensors based on do_verification: // - do_verification=2: GPU-side initialization // - do_verification=0,1: CPU-side initialization @@ -410,7 +413,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); - std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; + // std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; + std::vector split_k_list = {1, 2, 4, 8}; if(split_k > 0) { diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 012d6e1502..4c03f1539a 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -14,118 +14,118 @@ message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}") message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}") set(PROFILER_OPS - profile_gemm.cpp - profile_reduce.cpp - profile_groupnorm_bwd_data.cpp - profile_groupnorm_fwd.cpp - profile_layernorm_bwd_data.cpp - profile_layernorm_bwd_gamma_beta.cpp - profile_groupnorm_bwd_gamma_beta.cpp - profile_layernorm_fwd.cpp - profile_max_pool2d_fwd.cpp - profile_pool3d_fwd.cpp - profile_avg_pool3d_bwd.cpp - profile_max_pool3d_bwd.cpp - profile_avg_pool2d_bwd.cpp - profile_max_pool2d_bwd.cpp - profile_softmax.cpp - profile_batchnorm_fwd.cpp - profile_batchnorm_bwd.cpp - profile_batchnorm_infer.cpp - profile_conv_tensor_rearrange.cpp - profile_transpose.cpp - profile_permute_scale.cpp - profile_gemm_quantization.cpp + # profile_gemm.cpp + # profile_reduce.cpp + # profile_groupnorm_bwd_data.cpp + # profile_groupnorm_fwd.cpp + # profile_layernorm_bwd_data.cpp + # profile_layernorm_bwd_gamma_beta.cpp + # profile_groupnorm_bwd_gamma_beta.cpp + # profile_layernorm_fwd.cpp + # profile_max_pool2d_fwd.cpp + # profile_pool3d_fwd.cpp + # profile_avg_pool3d_bwd.cpp + # profile_max_pool3d_bwd.cpp + # profile_avg_pool2d_bwd.cpp + # profile_max_pool2d_bwd.cpp + # profile_softmax.cpp + # profile_batchnorm_fwd.cpp + # profile_batchnorm_bwd.cpp + # profile_batchnorm_infer.cpp + # profile_conv_tensor_rearrange.cpp + # profile_transpose.cpp + # profile_permute_scale.cpp + # profile_gemm_quantization.cpp ) if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") - if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp) - list(APPEND PROFILER_OPS profile_contraction_scale.cpp) - endif() - if(CK_EXPERIMENTAL_BUILDER) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp) - endif() + # if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) + # list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp) + # list(APPEND PROFILER_OPS profile_contraction_scale.cpp) + # endif() + # if(CK_EXPERIMENTAL_BUILDER) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp) + # endif() endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_OPS profile_gemm_reduce.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm.cpp) - list(APPEND PROFILER_OPS profile_gemm_streamk.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp) + # list(APPEND PROFILER_OPS profile_gemm_reduce.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add.cpp) + # list(APPEND PROFILER_OPS profile_grouped_gemm.cpp) + # list(APPEND PROFILER_OPS profile_gemm_streamk.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp) + # list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp) + # list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp) + # list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp) + # list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") - list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp) - list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp) + # list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp) + # list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") - list(APPEND PROFILER_OPS profile_gemm_mx.cpp) + # list(APPEND PROFILER_OPS profile_gemm_mx.cpp) endif() - list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) - list(APPEND PROFILER_OPS profile_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp) - list(APPEND PROFILER_OPS profile_gemm_splitk.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add.cpp) + # list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp) + # list(APPEND PROFILER_OPS profile_gemm_splitk.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp) + # list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp) + # list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp) list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd.cpp) + # list(APPEND PROFILER_OPS profile_conv_fwd.cpp) endif() if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")) - list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp) + # list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") - list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - list(APPEND PROFILER_OPS profile_gemm_universal.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_bnorm_clamp.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp) + # list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_bnorm_clamp.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp) - list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) - list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp) - list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp) + # list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) endif() - list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp) endif() if(DL_KERNELS) - list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp) - list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp) + # list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) endif() if(CK_ENABLE_INT8) - list(APPEND PROFILER_OPS profile_gemm_quantization.cpp) + # list(APPEND PROFILER_OPS profile_gemm_quantization.cpp) endif() set(PROFILER_SOURCES profiler.cpp) @@ -152,131 +152,131 @@ endif() set(DEVICE_INSTANCES "") -list(APPEND DEVICE_INSTANCES device_gemm_instance) -list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance) -list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance) -list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance) -list(APPEND DEVICE_INSTANCES device_softmax_instance) -list(APPEND DEVICE_INSTANCES device_reduce_instance) -list(APPEND DEVICE_INSTANCES device_batchnorm_instance) -list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance) -list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance) -list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance) -list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance) -list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance) -list(APPEND DEVICE_INSTANCES device_image_to_column_instance) -list(APPEND DEVICE_INSTANCES device_column_to_image_instance) -list(APPEND DEVICE_INSTANCES device_transpose_instance) -list(APPEND DEVICE_INSTANCES device_permute_scale_instance) +# list(APPEND DEVICE_INSTANCES device_gemm_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance) +# list(APPEND DEVICE_INSTANCES device_softmax_instance) +# list(APPEND DEVICE_INSTANCES device_reduce_instance) +# list(APPEND DEVICE_INSTANCES device_batchnorm_instance) +# list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_image_to_column_instance) +# list(APPEND DEVICE_INSTANCES device_column_to_image_instance) +# list(APPEND DEVICE_INSTANCES device_transpose_instance) +# list(APPEND DEVICE_INSTANCES device_permute_scale_instance) if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance) - list(APPEND DEVICE_INSTANCES device_contraction_scale_instance) + # list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance) + # list(APPEND DEVICE_INSTANCES device_contraction_scale_instance) endif() if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance) endif() - list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance) if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") - list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]") - list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance) - list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") - list(APPEND DEVICE_INSTANCES device_gemm_mx_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_mx_instance) endif() - list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance) - list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance) - list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance) - list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance) + # list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance) + # list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance) list(APPEND DEVICE_INSTANCES device_conv1d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_conv3d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_conv2d_bwd_data_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance) endif() if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" )) - list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") - list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - list(APPEND DEVICE_INSTANCES device_gemm_universal_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_scale_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_bnorm_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_scale_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_bnorm_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) - list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) endif() - list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(CK_EXPERIMENTAL_BUILDER) - list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances) + # list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances) endif() endif() if(DL_KERNELS) - list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) - list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) endif() if(CK_ENABLE_INT8) - list(APPEND DEVICE_INSTANCES device_quantization_instance) + # list(APPEND DEVICE_INSTANCES device_quantization_instance) endif() set(PROFILER_LIBS utility getopt::getopt)