diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
index 36dcf58d70..ff1282f3c7 100644
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[m, n]) * B1[n, o] + D1[m, o]
@@ -60,14 +60,14 @@ struct AddAddRelu
     {
         const ck::half_t x = c + d0 + d1;
 
-        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }
 
     __host__ __device__ void operator()(float& e,
                                         const float& c,
                                         const ck::half_t& d0,
                                         const ck::half_t& d1) const
     {
         const float x = c + d0 + d1;
 
-        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
     }
 };
diff --git a/example/62_convnd_activ/CMakeLists.txt b/example/62_convnd_activ/CMakeLists.txt
index ab136d99ba..79fafed4eb 100644
--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(convscale_add)
 add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)
+add_subdirectory(dynamic_unary)
 
 add_custom_target(example_convnd_activ_xdl)
 # ScaleAdd ScaleAdd Relu
diff --git a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
new file mode 100644
index 0000000000..23f07439a5
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
@@ -0,0 +1,45 @@
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_custom_target(example_convnd_activ_dynamic_unary_xdl)
+        # Sigmoid
+        add_example_executable(example_convnd_fwd_xdl_dynamic_sigmoid_fp16 convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_sigmoid_fp16)
+        # Tanh
+        add_example_executable(example_convnd_fwd_xdl_dynamic_tanh_fp16 convnd_fwd_xdl_dynamic_tanh_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_tanh_fp16)
+        # Relu
+        add_example_executable(example_convnd_fwd_xdl_dynamic_relu_fp16 convnd_fwd_xdl_dynamic_relu_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_relu_fp16)
+        # SoftRelu
+        add_example_executable(example_convnd_fwd_xdl_dynamic_softrelu_fp16 convnd_fwd_xdl_dynamic_softrelu_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_softrelu_fp16)
+        # Abs
+        add_example_executable(example_convnd_fwd_xdl_dynamic_abs_fp16 convnd_fwd_xdl_dynamic_abs_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_abs_fp16)
+        # Pow
+        add_example_executable(example_convnd_fwd_xdl_dynamic_pow_fp16 convnd_fwd_xdl_dynamic_pow_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_pow_fp16)
+        # Clipped Relu
+        add_example_executable(example_convnd_fwd_xdl_dynamic_clippedrelu_fp16 convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_clippedrelu_fp16)
+        # Leaky Relu
+        add_example_executable(example_convnd_fwd_xdl_dynamic_leakyrelu_fp16 convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_leakyrelu_fp16)
+        # Elu
+        add_example_executable(example_convnd_fwd_xdl_dynamic_elu_fp16 convnd_fwd_xdl_dynamic_elu_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_elu_fp16)
+        # Swish
+        add_example_executable(example_convnd_fwd_xdl_dynamic_swish_fp16 convnd_fwd_xdl_dynamic_swish_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_swish_fp16)
+        # PassThrough
+        add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fwd_xdl_dynamic_passthrough_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
+        # Logistic
+        add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
+        add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)
+
+        set(target 1)
+    endif()
+endforeach()
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
new file mode 100644
index 0000000000..ed31be19ee
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
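+// Common harness for the dynamic unary-op examples: a 3D GNDHWC/GKZYXC/GNDHWK
+// fp16 grouped forward convolution whose CDE elementwise operation is the
+// type-erased DynamicUnaryOp, so one device instance serves every activation
+// in this directory.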
+
+#pragma once
+
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+
+using InDataType       = ck::half_t;
+using WeiDataType      = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using OutDataType      = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using InElementOp      = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
+using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceGroupedConvNDActivInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        DynamicElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // NumGemmKPrefetchStage
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+template <typename DeviceConvNDFwdInstance, typename OutElementOp>
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
+{
+
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+ std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + wei.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{{}}, + std::array, 0>{{}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error("The device op with the specified compilation parameters does " + "not support this convolution problem."); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!"); + } + + return 
true; +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp new file mode 100644 index 0000000000..8fa455c62e --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::UnaryAbs out_element_op; + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp new file mode 100644 index 0000000000..239a21525b --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::ClippedRelu out_element_op(0.f, 1.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp new file mode 100644 index 0000000000..23a094af70 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::Elu out_element_op(2.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp new file mode 100644 index 0000000000..fe4b80a681 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_activ_dynamic_unary_common.hpp" + +#include "../run_convnd_activ_dynamic_example.inc" + +int main(int argc, char* argv[]) +{ + + ck::tensor_operation::element_wise::LeakyRelu out_element_op(0.f); + return !run_convnd_example(argc, argv, out_element_op); +} diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp new file mode 100644 index 0000000000..756c07ed85 --- /dev/null +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Logistic out_element_op(1.0f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
new file mode 100644
index 0000000000..6588ec5044
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::PassThrough out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
new file mode 100644
index 0000000000..90f00a166a
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Power out_element_op(4.f, 1.f, 2.f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
new file mode 100644
index 0000000000..830297cb56
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Relu out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
new file mode 100644
index 0000000000..b143b4a4eb
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
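+// Sigmoid carries no runtime parameters, so the DynamicUnaryOp wrapper only
+// records its type tag and the kernel rebuilds a plain Sigmoid on the device.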
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Sigmoid out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
new file mode 100644
index 0000000000..83ba0f7f8c
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::SoftRelu out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
new file mode 100644
index 0000000000..e862d1120a
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::Swish out_element_op(1.0f);
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
new file mode 100644
index 0000000000..a91fc7ce30
--- /dev/null
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+
+#include "../run_convnd_activ_dynamic_example.inc"
+
+int main(int argc, char* argv[])
+{
+
+    ck::tensor_operation::element_wise::TanH out_element_op;
+    return !run_convnd_example(argc, argv, out_element_op);
+}
diff --git a/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
new file mode 100644
index 0000000000..4e90cf9366
--- /dev/null
+++ b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+template <typename OutElementOp>
+bool run_convnd_example(int argc, char* argv[], const OutElementOp& out_element_op)
+{
+    print_helper_msg();
+
+    bool do_verification = true;
+    // Use floats for SoftRelu by default to avoid overflow after e^x.
+    int init_method =
+        std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
+    bool time_kernel = false;
+
+    // Following shapes are selected to avoid overflow. Expect inf in case of
+    // size increase for some elementwise ops.
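+    // Reading the initializer as ConvParam(ndim, G, N, K, C, filter lengths,
+    // image lengths, strides, dilations, left pads, right pads): a 2-group,
+    // batch-16, 8->128 channel 3x3x3 convolution over a 17^3 image, stride 2.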
+    ck::utils::conv::ConvParam conv_param{
+        3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+
+    const auto run = [&]() {
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        return run_grouped_conv<DeviceGroupedConvNDActivInstance>(do_verification,
+                                                                  init_method,
+                                                                  time_kernel,
+                                                                  conv_param,
+                                                                  in_g_n_c_wis_desc,
+                                                                  wei_g_k_c_xs_desc,
+                                                                  out_g_n_k_wos_desc,
+                                                                  in_element_op,
+                                                                  wei_element_op,
+                                                                  out_element_op);
+    };
+
+    if(conv_param.num_dim_spatial_ == 3)
+    {
+        return run();
+    }
+
+    return false;
+}
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 939ee17297..f21a45938f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -85,9 +85,9 @@ __global__ void
         BsPointer p_bs_grid,
         DsPointer p_ds_grid,
         EDataType* __restrict__ p_e_grid,
-        const AElementwiseOperation a_element_op,
-        const BElementwiseOperation b_element_op,
-        const CDEElementwiseOperation cde_element_op,
+        AElementwiseOperation a_element_op,
+        BElementwiseOperation b_element_op,
+        CDEElementwiseOperation cde_element_op,
         const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
         const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
         const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -121,6 +121,19 @@ __global__ void
     static_for<0, NumDTensor, 1>{}(
         [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });
 
+    if constexpr(is_same_v<AElementwiseOperation, tensor_operation::element_wise::DynamicUnaryOp>)
+    {
+        a_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<BElementwiseOperation, tensor_operation::element_wise::DynamicUnaryOp>)
+    {
+        b_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<CDEElementwiseOperation, tensor_operation::element_wise::DynamicUnaryOp>)
+    {
+        cde_element_op.InitUnaryOpPtrOnDevice();
+    }
+
     if constexpr(isMultiA || isMultiB)
     {
         AsPointer p_as_grid_grp;
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 135eaec93e..b914c0b96f 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -405,7 +405,7 @@ struct ScaleAddScaleAddRelu
                                          const float& d1) const
     {
         const float x = c * alpha1_ + alpha2_ * d0 + d1;
-        Relu{}.template operator()<float>(e, x);
+        e             = x > 0 ? x : 0;
     }
 
     template <>
@@ -416,7 +416,7 @@ struct ScaleAddScaleAddRelu
                        type_convert<float>(d1);
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert(result);
     }
 
@@ -429,7 +429,7 @@ struct ScaleAddScaleAddRelu
                        type_convert<float>(d1);
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert(result);
     }
@@ -441,7 +441,7 @@ struct ScaleAddScaleAddRelu
         const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
 
         float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
 
         e = type_convert(result);
     }
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ab6b1691af..712b886183 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -7,11 +7,36 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include <type_traits>
 
 namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+struct UnaryOpBase
+{
+    public:
+    __host__ __device__ virtual ~UnaryOpBase() = default;
+
+    __host__ __device__ UnaryOpBase()                              = default;
+    __host__ __device__ UnaryOpBase(const UnaryOpBase&)            = default;
+    __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
+    __host__ __device__ UnaryOpBase(UnaryOpBase&&)                 = default;
+    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&)      = default;
+
+    __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(double& y, const double& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(int32_t& y, const int32_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(int8_t& y, const int8_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(half_t& y, const half_t& x) const = 0;
+
+    __host__ __device__ virtual inline void operator()(bhalf_t& y, const bhalf_t& x) const = 0;
+};
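+
+// UnaryOpBase is the common vtable interface for the activations below: one
+// virtual overload per supported data type. DynamicUnaryOp at the end of this
+// file stores a UnaryOpBase* so a single kernel instantiation can dispatch to
+// any derived op at run time.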
+
 struct PassThroughPack2
 {
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const;
 
     constexpr const static bool is_pack2_invocable = true;
 };
 
-struct PassThrough
+struct PassThrough : public UnaryOpBase
 {
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final { y = x; }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { y = x; }
+
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const;
 
-    template <>
-    __host__ __device__ void operator()(double& y, const double& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()(float& y, const double& x) const
     {
         y = type_convert<float>(x);
     }
@@ -48,36 +80,12 @@ struct PassThrough
         y = type_convert<half_t>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()(half_t& y, const float& x) const
     {
         y = type_convert<half_t>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()(bhalf_t& y, const float& x) const
     {
         y = type_convert<bhalf_t>(x);
     }
@@ -102,12 +110,6 @@ struct PassThrough
         y = type_convert<int8_t>(x);
     }
 
-    template <>
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
-    {
-        y = x;
-    }
-
     template <>
     __host__ __device__ void operator()(half_t& y, const int8_t& x) const
     {
         y = type_convert<half_t>(x);
     }
@@ -407,20 +409,38 @@ struct UnarySquare
     };
 };
 
-struct UnaryAbs
+struct UnaryAbs : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
         y = ck::math::abs(x);
-    };
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
 
-    template <>
     __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
     {
         y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
@@ -439,20 +459,34 @@ struct UnarySqrt
     };
 };
 
-struct Relu
+struct Relu : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
         y = x > 0 ? x : 0;
     }
 
-    template <>
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
     {
         float x_f32 = ck::type_convert<float>(x);
         float y_f32 = x_f32 > 0 ? x_f32 : 0;
         y           = ck::type_convert<bhalf_t>(y_f32);
     }
@@ -599,18 +633,46 @@ struct Gelu
     }
 };
 
-struct Sigmoid
+struct Sigmoid : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        constexpr T one = type_convert<T>(1);
-        y               = one / (one + ck::math::exp(-x));
-    };
+        constexpr float one = type_convert<float>(1);
+        y                   = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        constexpr double one = type_convert<double>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        constexpr float one = type_convert<float>(1);
+        float x_f32         = ck::type_convert<float>(x);
+        float y_f32         = one / (one + ck::math::exp(-x_f32));
+        y                   = ck::type_convert<bhalf_t>(y_f32);
+    }
 };
 
 struct Silu
@@ -626,18 +688,37 @@ struct Silu
     };
 };
 
-struct TanH
+struct TanH : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
         y = ck::math::tanh(x);
-    };
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
 };
 
 struct ACos
@@ -878,138 +959,393 @@ struct Rcp
     };
 };
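 
+// The parameterised activations below (Swish, SoftRelu, Power, ClippedRelu,
+// LeakyRelu, Elu, Logistic) all follow one pattern: capture their float
+// parameters, expose get_*() accessors so DynamicUnaryOp can clone them on the
+// device, and override the six UnaryOpBase overloads; Swish also keeps its
+// template operator() for mixed-precision Y != X calls.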
-struct Swish
+struct Swish : public UnaryOpBase
 {
-    Swish(float beta = 1.0f) : beta_(beta) {}
+    __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {}
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
+    const float beta_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<float>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<double>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int32_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int8_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<half_t>(x / (1.f + ck::math::exp(bx)));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<bhalf_t>(x / (1.f + ck::math::exp(bx)));
+    }
 
     template <typename Y, typename X>
     __host__ __device__ void operator()(Y& y, const X& x) const
     {
         static_assert(is_same<X, float>::value || is_same<X, double>::value ||
                           is_same<X, half_t>::value,
                       "Data type is not supported by this operation!");
 
         static_assert(is_same<Y, float>::value || is_same<Y, double>::value ||
                           is_same<Y, half_t>::value,
                       "Data type is not supported by this operation!");
 
         float bx = -beta_ * type_convert<float>(x);
         y        = type_convert<Y>(x / (1.f + ck::math::exp(bx)));
-    };
-
-    const float beta_;
+    }
 };
-struct SoftRelu
+struct SoftRelu : public UnaryOpBase
 {
-    SoftRelu(float alpha = 1.f) : alpha_(alpha){};
+    __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
 
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
-        y               = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
     const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha  = type_convert<float>(alpha_);
+        constexpr float one = type_convert<float>(1);
+        y                   = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
+        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
 };
 
-struct Power
+struct Power : public UnaryOpBase
 {
-    Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
-        : alpha_(alpha), beta_(beta), gamma_(gamma){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
+        : alpha_(alpha), beta_(beta), gamma_(gamma)
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha     = type_convert<T>(alpha_);
-        T casted_beta      = type_convert<T>(beta_);
-        T casted_gamma     = type_convert<T>(gamma_);
-        T shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                  = ck::math::pow(shifted_scaled_x, casted_gamma);
     }
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
+    __host__ __device__ float get_gamma() const { return gamma_; }
+
     const float alpha_;
     const float beta_;
     const float gamma_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        float casted_gamma = type_convert<float>(gamma_);
+
+        float shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                      = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        double casted_gamma = type_convert<double>(gamma_);
+
+        double shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        int32_t casted_gamma = type_convert<int32_t>(gamma_);
+
+        int32_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        int8_t casted_gamma = type_convert<int8_t>(gamma_);
+
+        int8_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        half_t casted_gamma = type_convert<half_t>(gamma_);
+
+        half_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
+        bhalf_t casted_gamma = type_convert<bhalf_t>(gamma_);
+
+        bhalf_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
 };
-struct ClippedRelu
+struct ClippedRelu : public UnaryOpBase
 {
-    ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f)
+        : alpha_(alpha), beta_(beta)
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        T casted_beta  = type_convert<T>(beta_);
-        y              = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
     }
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
+    __host__ __device__ float get_beta() const { return beta_; }
+
     const float alpha_;
     const float beta_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        y                  = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
+        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
 };
 
-struct LeakyRelu
+struct LeakyRelu : public UnaryOpBase
 {
-    LeakyRelu(float alpha = 0.01f) : alpha_(alpha){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        y              = x >= 0 ? x : x * casted_alpha;
-    }
+    __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
     const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        y                  = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
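+
+    // Note: the bhalf_t overload below is intentionally a stub; LeakyRelu does
+    // not implement a bf16 path here.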
+    __host__ __device__ inline void operator()([[maybe_unused]] bhalf_t& y,
+                                               [[maybe_unused]] const bhalf_t& x) const final
+    {
+    }
 };
 
-struct Elu
+struct Elu : public UnaryOpBase
 {
-    Elu(float alpha = 1.f) : alpha_(alpha){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        y              = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
+    __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
     const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        y                  = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
+        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
 };
 
-struct Logistic
+struct Logistic : public UnaryOpBase
 {
-    Logistic(float alpha = 1.f) : alpha_(alpha){};
-
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
-        y               = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
+    __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {}
+
+    __host__ __device__ float get_alpha() const { return alpha_; }
+
     const float alpha_;
+
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha  = type_convert<float>(alpha_);
+        constexpr float one = type_convert<float>(1);
+        y                   = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
+        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
 };
 
 struct ConvInvscale
@@ -1074,7 +1410,7 @@ struct ConvScaleRelu
     __host__ __device__ void operator()(f8_t& e, const float& c) const
     {
         float x;
-        Relu{}.template operator()<float>(x, c * scale_in_ * scale_wei_);
+        Relu{}(x, c * scale_in_ * scale_wei_);
         e = type_convert<f8_t>(x * scale_out_);
     };
 
@@ -1153,6 +1489,239 @@ struct FastNumericArrayConverter
     __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); }
 };
 
+struct DynamicUnaryOp
+{
+
+    DynamicUnaryOp& operator=(const DynamicUnaryOp& other)
+    {
+        if(this != &other)
+        {
+            unary_op_ptr_  = other.unary_op_ptr_;
+            unary_op_type_ = other.unary_op_type_;
+        }
+        return *this;
+    }
+
+    __host__ __device__ DynamicUnaryOp() = delete;
+
+    __host__ __device__ DynamicUnaryOp(const Swish& swish)
+    {
+        unary_op_type_ = UnaryOpType::Swish;
+        beta           = swish.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Swish&& swish)
+    {
+        unary_op_type_ = UnaryOpType::Swish;
+        beta           = swish.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+
+    __host__ __device__ DynamicUnaryOp(const PassThrough&)
+    {
+        unary_op_type_ = UnaryOpType::PassThrough;
+    }
+
+    __host__ __device__ DynamicUnaryOp(const PassThrough&&)
+    {
+        unary_op_type_ = UnaryOpType::PassThrough;
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Logistic& logistic)
+    {
+        unary_op_type_ = UnaryOpType::Logistic;
+        alpha          = logistic.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Logistic&& logistic)
+    {
+        unary_op_type_ = UnaryOpType::Logistic;
+        alpha          = logistic.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const TanH&) { unary_op_type_ = UnaryOpType::TanH; }
+
+    __host__ __device__ DynamicUnaryOp(const TanH&&) { unary_op_type_ = UnaryOpType::TanH; }
+
+    __host__ __device__ DynamicUnaryOp(const Relu&) { unary_op_type_ = UnaryOpType::Relu; }
+
+    __host__ __device__ DynamicUnaryOp(const Relu&&) { unary_op_type_ = UnaryOpType::Relu; }
+
+    __host__ __device__ DynamicUnaryOp(const SoftRelu& softrelu)
+    {
+        unary_op_type_ = UnaryOpType::SoftRelu;
+        alpha          = softrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const SoftRelu&& softrelu)
+    {
+        unary_op_type_ = UnaryOpType::SoftRelu;
+        alpha          = softrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+
+    __host__ __device__ DynamicUnaryOp(const Power& pow)
+    {
+        unary_op_type_ = UnaryOpType::Power;
+        alpha          = pow.get_alpha();
+        beta           = pow.get_beta();
+        gamma          = pow.get_gamma();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Power&& pow)
+    {
+        unary_op_type_ = UnaryOpType::Power;
+        alpha          = pow.get_alpha();
+        beta           = pow.get_beta();
+        gamma          = pow.get_gamma();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu& clippedrelu)
+    {
+        unary_op_type_ = UnaryOpType::ClippedRelu;
+        alpha          = clippedrelu.get_alpha();
+        beta           = clippedrelu.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu&& clippedrelu)
+    {
+        unary_op_type_ = UnaryOpType::ClippedRelu;
+        alpha          = clippedrelu.get_alpha();
+        beta           = clippedrelu.get_beta();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu& leakyrelu)
+    {
+        unary_op_type_ = UnaryOpType::LeakyRelu;
+        alpha          = leakyrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu&& leakyrelu)
+    {
+        unary_op_type_ = UnaryOpType::LeakyRelu;
+        alpha          = leakyrelu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Elu& elu)
+    {
+        unary_op_type_ = UnaryOpType::Elu;
+        alpha          = elu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const Elu&& elu)
+    {
+        unary_op_type_ = UnaryOpType::Elu;
+        alpha          = elu.get_alpha();
+    }
+
+    __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op)
+        : unary_op_type_(dynamic_op.unary_op_type_),
+          unary_op_ptr_(dynamic_op.unary_op_ptr_),
+          alpha(dynamic_op.alpha),
+          beta(dynamic_op.beta),
+          gamma(dynamic_op.gamma)
+    {
+    }
+
+    __host__ __device__ ~DynamicUnaryOp()
+    {
+        if(unary_op_ptr_)
+            delete unary_op_ptr_;
+    }
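+
+    // A vtable set up on the host is not usable from device code, so the
+    // kernel (see device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp above)
+    // rebuilds the op on the device from unary_op_type_ and alpha/beta/gamma
+    // before first use.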
+    __device__ void InitUnaryOpPtrOnDevice()
+    {
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): unary_op_ptr_ = new Swish(beta); break;
+        case(UnaryOpType::Sigmoid): unary_op_ptr_ = new Sigmoid; break;
+        case(UnaryOpType::PassThrough): unary_op_ptr_ = new PassThrough; break;
+        case(UnaryOpType::Logistic): unary_op_ptr_ = new Logistic(alpha); break;
+        case(UnaryOpType::TanH): unary_op_ptr_ = new TanH; break;
+        case(UnaryOpType::Relu): unary_op_ptr_ = new Relu; break;
+        case(UnaryOpType::SoftRelu): unary_op_ptr_ = new SoftRelu(alpha); break;
+        case(UnaryOpType::UnaryAbs): unary_op_ptr_ = new UnaryAbs; break;
+        case(UnaryOpType::Power): unary_op_ptr_ = new Power(alpha, beta, gamma); break;
+        case(UnaryOpType::ClippedRelu): unary_op_ptr_ = new ClippedRelu(alpha, beta); break;
+        case(UnaryOpType::LeakyRelu): unary_op_ptr_ = new LeakyRelu(alpha); break;
+        case(UnaryOpType::Elu): unary_op_ptr_ = new Elu(alpha); break;
+
+        default: unary_op_ptr_ = nullptr; break;
+        }
+    }
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<Y, X>();
+        unary_op_ptr_->operator()(y, x);
+    }
+
+    template <typename Y, typename X>
+    __host__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<Y, X>();
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): Swish{}.operator()(y, x); break;
+        case(UnaryOpType::Sigmoid): Sigmoid{}.operator()(y, x); break;
+        case(UnaryOpType::PassThrough): PassThrough{}.operator()(y, x); break;
+        case(UnaryOpType::Logistic): Logistic{}.operator()(y, x); break;
+        case(UnaryOpType::TanH): TanH{}.operator()(y, x); break;
+        case(UnaryOpType::Relu): Relu{}.operator()(y, x); break;
+        case(UnaryOpType::SoftRelu): SoftRelu{}.operator()(y, x); break;
+        case(UnaryOpType::UnaryAbs): UnaryAbs{}.operator()(y, x); break;
+        case(UnaryOpType::Power): Power{}.operator()(y, x); break;
+        case(UnaryOpType::ClippedRelu): ClippedRelu{}.operator()(y, x); break;
+        case(UnaryOpType::LeakyRelu): LeakyRelu{}.operator()(y, x); break;
+        case(UnaryOpType::Elu): Elu{}.operator()(y, x); break;
+        default: break;
+        }
+    }
+
+    template <typename Y, typename X>
+    __device__ __host__ constexpr void isSupported() const
+    {
+
+        static_assert(std::is_same<X, Y>::value, "X and Y must be of the same type");
+
+        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
+                          is_same<X, int32_t>::value || is_same<X, int8_t>::value ||
+                          is_same<X, half_t>::value || is_same<X, bhalf_t>::value,
+                      "Data type is not supported by this operation!");
+    }
+
+    private:
+    enum class UnaryOpType
+    {
+        Swish,
+        Sigmoid,
+        PassThrough,
+        Logistic,
+        TanH,
+        Relu,
+        SoftRelu,
+        UnaryAbs,
+        Power,
+        ClippedRelu,
+        LeakyRelu,
+        Elu
+    };
+
+    public:
+    UnaryOpType unary_op_type_;
+    UnaryOpBase* unary_op_ptr_ = nullptr;
+    float alpha;
+    float beta;
+    float gamma;
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
new file mode 100644
index 0000000000..9db675a515
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using namespace ck::tensor_layout::convolution;
+
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 = ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 = ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
+    // clang-format off
+    //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+    //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+    //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+    //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+    // generic instance
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, BF16, BF16, F32, BF16, ck::Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 
4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +template +using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
+    // clang-format off
+        //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+        //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+        //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
+    // clang-format off
+        //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+        //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+        //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
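For orientation: each `std::tuple` above is a compile-time list of kernel configurations, and the factory below flattens such tuples into a runtime vector of device-operation pointers. A minimal sketch of that expansion follows; it is not CK's actual `add_device_operation_instances` implementation (which lives in add_device_operation_instance.hpp), and the helper name `append_instances` is hypothetical.

#include <memory>
#include <tuple>
#include <vector>

// Hedged sketch: turn every compile-time instance type in a tuple into one
// heap-allocated, default-constructed device op and append it to the vector.
template <typename BaseOp, typename... Instances>
void append_instances(std::vector<std::unique_ptr<BaseOp>>& op_ptrs, std::tuple<Instances...>)
{
    (op_ptrs.push_back(std::make_unique<Instances>()), ...);
}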
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
new file mode 100644
index 0000000000..5efee69b2f
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dynamic.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using DynamicUnaryOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+
+#ifdef CK_ENABLE_BF16
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, BF16, BF16, ck::Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, F16, F16, ck::Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, F32, F32, ck::Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, int8_t, int8_t, ck::Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, BF16, BF16, ck::Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F16, F16, ck::Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F32, F32, ck::Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, int8_t, int8_t, ck::Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp>>>& instances);
+#endif
+
+template <index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DLayouts,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DDataTypes,
+          typename OutDataType,
+          typename ComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial, InLayout, WeiLayout, DLayouts, OutLayout, InDataType, WeiDataType, DDataTypes, OutDataType, PassThrough, PassThrough, DynamicUnaryOp, ComputeType>>
+{
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleABD<NumDimSpatial, InLayout, WeiLayout, DLayouts, OutLayout, InDataType, WeiDataType, DDataTypes, OutDataType, PassThrough, PassThrough, DynamicUnaryOp, ComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK> &&
+                     DLayouts::Size() == 0)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                         is_same_v<OutDataType, F32>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                         is_same_v<OutDataType, F16> && is_same_v<ComputeType, F16>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, BF16> &&
+                         is_same_v<WeiDataType, BF16> && is_same_v<OutDataType, BF16>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                          is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK> &&
+                          DLayouts::Size() == 0)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                         is_same_v<OutDataType, F32>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                         is_same_v<OutDataType, F16> && is_same_v<ComputeType, F16>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, BF16> &&
+                         is_same_v<WeiDataType, BF16> && is_same_v<OutDataType, BF16>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
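A short sketch of how a client would consume this factory; the `DeviceOp` argument list here mirrors the declarations above but should be treated as an assumption, not a guaranteed interface. `GetTypeString()` is part of CK's BaseOperator interface.

#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp"

// Hedged usage sketch: enumerate the registered 3D NDHWGC/GKZYXC/NDHWGK fp16
// dynamic-op instances and print each kernel's configuration string.
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    3, ck::tensor_layout::convolution::NDHWGC, ck::tensor_layout::convolution::GKZYXC,
    ck::Tuple<>, ck::tensor_layout::convolution::NDHWGK, ck::half_t, ck::half_t, ck::Tuple<>,
    ck::half_t, ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::DynamicUnaryOp>;

int main()
{
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::cout << "found " << op_ptrs.size() << " instances\n";
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n';
}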
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt
new file mode 100644
index 0000000000..92735fcaeb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/CMakeLists.txt
@@ -0,0 +1,8 @@
+# ONLY XDL_KERNELS
+set(GROUPED_CONV2D_FWD_DYNAMIC_OP
+    xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+    xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+    xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp)
+
+add_instance_library(device_grouped_conv2d_fwd_dynamic_op_instance ${GROUPED_CONV2D_FWD_DYNAMIC_OP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
new file mode 100644
index 0000000000..853470e1c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, BF16, BF16, ck::Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
new file mode 100644
index 0000000000..725b9ca0d7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, F16, F16, ck::Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
new file mode 100644
index 0000000000..fbd5fe3700
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, F32, F32, ck::Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
new file mode 100644
index 0000000000..6bfc29537e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_dynamic_op/xdl/device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv2d_fwd_xdl_dynamic_op_nhwgc_gkyxc_nhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2, NHWGC, GKYXC, ck::Tuple<>, NHWGK, int8_t, int8_t, ck::Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<2, NHWGC, GKYXC, Tuple<>, NHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
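The `DynamicUnaryOp` epilogue is what distinguishes these instance sets from the static `unary` ones: the activation is chosen at run time instead of being baked into the instance type, so one compiled set serves sigmoid, tanh, relu, and the rest of the dynamic_unary example list. A hedged sketch follows, assuming (not verified here) that `DynamicUnaryOp` is constructible from the existing static element-wise functors; `make_activation` is a hypothetical helper.

#include <string>
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ew = ck::tensor_operation::element_wise;

// Hedged sketch: pick the elementwise epilogue at run time. The constructor
// calls below are assumptions based on how the dynamic_unary examples are
// named, not a confirmed API.
ew::DynamicUnaryOp make_activation(const std::string& name)
{
    if(name == "sigmoid")
        return ew::DynamicUnaryOp{ew::Sigmoid{}};
    if(name == "tanh")
        return ew::DynamicUnaryOp{ew::Tanh{}};
    return ew::DynamicUnaryOp{ew::PassThrough{}}; // default: identity
}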
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt
new file mode 100644
index 0000000000..3b8ebbffd1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/CMakeLists.txt
@@ -0,0 +1,8 @@
+# ONLY XDL_KERNELS
+set(GROUPED_CONV3D_FWD_DYNAMIC_OP
+    xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+    xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+    xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp)
+
+add_instance_library(device_grouped_conv3d_fwd_dynamic_op_instance ${GROUPED_CONV3D_FWD_DYNAMIC_OP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
new file mode 100644
index 0000000000..249dfaa4d8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, BF16, BF16, ck::Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
new file mode 100644
index 0000000000..75c4ddc35d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F16, F16, ck::Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f16_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
new file mode 100644
index 0000000000..2e237e07bf
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F32, F32, ck::Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_f32_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
new file mode 100644
index 0000000000..e38f1acbd6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_dynamic_op/xdl/device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_dynamic_op_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv3d_fwd_xdl_dynamic_op_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, int8_t, int8_t, ck::Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_dynamic_op_int8_instances<3, NDHWGC, GKZYXC, Tuple<>, NDHWGK, ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
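Finally, each dtype registers the Default, Filter1x1Pad0, and Filter1x1Stride1Pad0 specializations, and the standard CK client pattern is to filter them at run time through IsSupportedArgument(). A hedged sketch of that dispatch loop follows; the full convolution problem description required by MakeArgumentPointer() (lengths, strides, pads, dilations, device buffers) is elided behind a caller-supplied callback, so treat this as illustrative rather than a complete driver (see the 62_convnd_activ/dynamic_unary examples for one).

#include <memory>
#include <vector>
#include "ck/stream_config.hpp"

// Hedged sketch: run the first registered instance that supports the given
// problem. 'make_arg' is a hypothetical callback that wraps the op's
// MakeArgumentPointer(...) call with the concrete conv description.
template <typename DeviceOpPtrs, typename MakeArg>
bool run_first_supported(DeviceOpPtrs& op_ptrs, MakeArg&& make_arg)
{
    for(auto& op : op_ptrs)
    {
        auto arg = make_arg(*op); // returns a unique_ptr<BaseArgument>
        if(op->IsSupportedArgument(arg.get()))
        {
            auto invoker = op->MakeInvokerPointer();
            invoker->Run(arg.get(), StreamConfig{nullptr, false});
            return true; // a 1x1 specialization wins over Default when legal
        }
    }
    return false; // no instance supported this problem
}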