From 3190273630f1da55356aec729986546d21883747 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Thu, 25 Aug 2022 17:08:43 -0500 Subject: [PATCH] Add int4 example for convnd_fwd_bias_relu_add (#375) * Add int4 example for convnd_fwd_bias_relu_add * Fix AddReluAdd for building without int4 support * Update CMakeLists.txt * Format * Convert int4 tensors for int8 kernel * Fix device memory allocation * Format * Format [ROCm/composable_kernel commit: b73ae2423495a9054ceaec4d529d30db7e089743] --- .../CMakeLists.txt | 8 +- ...rouped_convnd_fwd_bias_relu_add_common.hpp | 73 ++- ...uped_convnd_fwd_bias_relu_add_xdl_bf16.cpp | 55 ++- ...uped_convnd_fwd_bias_relu_add_xdl_fp16.cpp | 55 ++- ...uped_convnd_fwd_bias_relu_add_xdl_fp32.cpp | 55 ++- ...uped_convnd_fwd_bias_relu_add_xdl_int4.cpp | 459 ++++++++++++++++++ ...uped_convnd_fwd_bias_relu_add_xdl_int8.cpp | 55 ++- .../gpu/element/element_wise_operation.hpp | 12 + 8 files changed, 665 insertions(+), 107 deletions(-) create mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt index 628cb93daa..98c2211b19 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt +++ b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt @@ -1,11 +1,11 @@ add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp) -target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility) add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp) -target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility) add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp) -target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility) add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp) -target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility) \ No newline at end of file + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp) +endif() # USE_BITINT_EXTENSION_INT4 diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp index 3fb62e77e2..a2d9c21287 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp @@ -26,13 +26,16 @@ void print_helper_msg() } template int run_grouped_conv_fwd_bias_relu_add(bool do_verification, int init_method, @@ -47,12 +50,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, const WeiElementOp& wei_element_op, const OutElementOp& out_element_op) { - Tensor in(in_g_n_c_wis_desc); - Tensor wei(wei_g_k_c_xs_desc); - Tensor bias(bias_g_n_k_wos_desc); - Tensor residual(residual_g_n_k_wos_desc); - Tensor out_host(out_g_n_k_wos_desc); - Tensor out_device(out_g_n_k_wos_desc); + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_n_k_wos_desc); + Tensor residual(residual_g_n_k_wos_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); std::cout << "in: " << in.mDesc << std::endl; std::cout << "wei: " << wei.mDesc << std::endl; @@ -64,26 +67,38 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, { case 0: break; case 1: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; default: - in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); } - DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); - DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize()); - DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutKernelDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + const Tensor in_converted(in); + const Tensor wei_converted(wei); + const Tensor bias_converted(bias); + const Tensor residual_converted(residual); + + in_device_buf.ToDevice(in_converted.mData.data()); + wei_device_buf.ToDevice(wei_converted.mData.data()); + bias_device_buf.ToDevice(bias_converted.mData.data()); + residual_device_buf.ToDevice(residual_converted.mData.data()); +#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 in_device_buf.ToDevice(in.mData.data()); wei_device_buf.ToDevice(wei.mData.data()); bias_device_buf.ToDevice(bias.mData.data()); residual_device_buf.ToDevice(residual.mData.data()); +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 std::array a_g_n_c_wis_lengths{}; std::array a_g_n_c_wis_strides{}; @@ -154,7 +169,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = conv_param.GetFlops(); - std::size_t num_btype = conv_param.GetByte(); + std::size_t num_btype = conv_param.GetByte(); float tflops = static_cast(flop) / 1.E9 / avg_time; float gb_per_sec = num_btype / 1.E6 / avg_time; @@ -168,8 +183,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, Tensor c_host(out_g_n_k_wos_desc); auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd out_device_converted(out_device); + + return ck::utils::check_err(out_device_converted.mData, + out_host.mData, + "Error: incorrect results!", + 1e-5f, + 1e-4f) + ? 0 + : 1; +#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 return ck::utils::check_err( out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1; +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 } return 0; diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp index 1da96b2d37..4ac996dbaa 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp @@ -7,13 +7,19 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" -using InDataType = ck::bhalf_t; -using WeiDataType = ck::bhalf_t; -using AccDataType = float; -using CShuffleDataType = float; -using BiasDataType = ck::bhalf_t; -using ResidualDataType = ck::bhalf_t; -using OutDataType = ck::bhalf_t; +// kernel data types +using InKernelDataType = ck::bhalf_t; +using WeiKernelDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = float; +using BiasKernelDataType = ck::bhalf_t; +using ResidualKernelDataType = ck::bhalf_t; +using OutKernelDataType = ck::bhalf_t; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; template using S = ck::Sequence; @@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance = WeiLayout, ck::Tuple, OutLayout, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, AccDataType, CShuffleDataType, - ck::Tuple, - OutDataType, + ck::Tuple, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, @@ -181,13 +187,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<1, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, @@ -290,13 +299,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<2, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, @@ -413,13 +425,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<3, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp index d505073f28..8846633982 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp @@ -7,13 +7,19 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" -using InDataType = ck::half_t; -using WeiDataType = ck::half_t; -using AccDataType = float; -using CShuffleDataType = ck::half_t; -using BiasDataType = ck::half_t; -using ResidualDataType = ck::half_t; -using OutDataType = ck::half_t; +// kernel data types +using InKernelDataType = ck::half_t; +using WeiKernelDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using BiasKernelDataType = ck::half_t; +using ResidualKernelDataType = ck::half_t; +using OutKernelDataType = ck::half_t; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; template using S = ck::Sequence; @@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance = WeiLayout, ck::Tuple, OutLayout, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, AccDataType, CShuffleDataType, - ck::Tuple, - OutDataType, + ck::Tuple, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, @@ -181,13 +187,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<1, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, @@ -290,13 +299,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<2, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, @@ -413,13 +425,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<3, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp index 5237a9cb5a..c792ac5fe3 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp @@ -7,13 +7,19 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" -using InDataType = float; -using WeiDataType = float; -using AccDataType = float; -using CShuffleDataType = float; -using BiasDataType = float; -using ResidualDataType = float; -using OutDataType = float; +// kernel data types +using InKernelDataType = float; +using WeiKernelDataType = float; +using AccDataType = float; +using CShuffleDataType = float; +using BiasKernelDataType = float; +using ResidualKernelDataType = float; +using OutKernelDataType = float; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; template using S = ck::Sequence; @@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance = WeiLayout, ck::Tuple, OutLayout, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, AccDataType, CShuffleDataType, - ck::Tuple, - OutDataType, + ck::Tuple, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, @@ -181,13 +187,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<1, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, @@ -290,13 +299,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<2, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, @@ -413,13 +425,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<3, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp new file mode 100644 index 0000000000..d989e63590 --- /dev/null +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "grouped_convnd_fwd_bias_relu_add_common.hpp" + +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +// kernel data types +using InKernelDataType = int8_t; +using WeiKernelDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; +using BiasKernelDataType = int8_t; +using ResidualKernelDataType = int8_t; +using OutKernelDataType = int8_t; + +// tensor data types +using InUserDataType = ck::int4_t; +using WeiUserDataType = ck::int4_t; +using OutUserDataType = ck::int4_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InKernelDataType, + WeiKernelDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 64, 1, 4>, + 16>; + +int main(int argc, char* argv[]) +{ + namespace ctc = ck::tensor_layout::convolution; + + print_helper_msg(); + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // conventional group conv definition + // G = 2 + // [N, C, Hi, Wi] = [128, 384, 71, 71] + // [K, C, Y, X] = [512, 192, 3, 3] + // [N, K, Ho, Wo] = [128, 512, 36, 36] + // CK group conv definition + // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] + // [G, K, C, Y, X] = [2, 256, 192, 3, 3] + // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] + ck::utils::conv::ConvParam conv_param{ + 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(conv_param.num_dim_spatial_ == 1) + { + using InLayout = ctc::G_NW_C; + using WeiLayout = ctc::G_K_X_C; + using BiasLayout = ctc::G_NW_K; + using ResidualLayout = ctc::G_NW_K; + using OutLayout = ctc::G_NW_K; + + const auto in_g_n_c_wis_desc = HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.G_ * conv_param.C_ // wi + }); + + const auto wei_g_k_c_xs_desc = HostTensorDescriptor( + {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k + 1, // c + conv_param.C_ // x + }); + + const auto bias_g_n_k_wos_desc = HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + 0, // k + 1, // c + 0 // x + }); + + const auto residual_g_n_k_wos_desc = HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + 0, // k + 1, // c + 0 // x + }); + + const auto out_g_n_k_wos_desc = HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.G_ * conv_param.K_ // wo + }); + + return run_grouped_conv_fwd_bias_relu_add<1, + InKernelDataType, + WeiKernelDataType, + CShuffleDataType, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, + DeviceGroupedConvNDFwdInstance<1, + InLayout, + WeiLayout, + BiasLayout, + ResidualLayout, + OutLayout>>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + bias_g_n_k_wos_desc, + residual_g_n_k_wos_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 2) + { + using InLayout = ctc::G_NHW_C; + using WeiLayout = ctc::G_K_YX_C; + using BiasLayout = ctc::G_NHW_K; + using ResidualLayout = ctc::G_NHW_K; + using OutLayout = ctc::G_NHW_K; + + const auto in_g_n_c_wis_desc = HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + + const auto wei_g_k_c_xs_desc = + HostTensorDescriptor({conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y + conv_param.C_ // x + }); + + const auto bias_g_n_k_wos_desc = + HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + + const auto residual_g_n_k_wos_desc = + HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + + const auto out_g_n_k_wos_desc = HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + + return run_grouped_conv_fwd_bias_relu_add<2, + InKernelDataType, + WeiKernelDataType, + CShuffleDataType, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, + DeviceGroupedConvNDFwdInstance<2, + InLayout, + WeiLayout, + BiasLayout, + ResidualLayout, + OutLayout>>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + bias_g_n_k_wos_desc, + residual_g_n_k_wos_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 3) + { + using InLayout = ctc::G_NDHW_C; + using WeiLayout = ctc::G_K_ZYX_C; + using BiasLayout = ctc::G_NDHW_K; + using ResidualLayout = ctc::G_NDHW_K; + using OutLayout = ctc::G_NDHW_K; + + const auto in_g_n_c_wis_desc = HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1], + conv_param.input_spatial_lengths_[2]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * + conv_param.G_ * conv_param.C_, // di + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + + const auto wei_g_k_c_xs_desc = HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1], + conv_param.filter_spatial_lengths_[2]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // z + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y + conv_param.C_ // x + }); + + const auto bias_g_n_k_wos_desc = + HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // z + 0, // y + 0 // x + }); + + const auto residual_g_n_k_wos_desc = + HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // z + 0, // y + 0 // x + }); + + const auto out_g_n_k_wos_desc = HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * + conv_param.G_ * conv_param.K_, // do + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + + return run_grouped_conv_fwd_bias_relu_add<3, + InKernelDataType, + WeiKernelDataType, + CShuffleDataType, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, + DeviceGroupedConvNDFwdInstance<3, + InLayout, + WeiLayout, + BiasLayout, + ResidualLayout, + OutLayout>>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + bias_g_n_k_wos_desc, + residual_g_n_k_wos_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + + return 0; +} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp index 859c9cea34..9aabe86948 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp @@ -7,13 +7,19 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" -using InDataType = int8_t; -using WeiDataType = int8_t; -using AccDataType = int32_t; -using CShuffleDataType = int8_t; -using BiasDataType = int8_t; -using ResidualDataType = int8_t; -using OutDataType = int8_t; +// kernel data types +using InKernelDataType = int8_t; +using WeiKernelDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; +using BiasKernelDataType = int8_t; +using ResidualKernelDataType = int8_t; +using OutKernelDataType = int8_t; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; template using S = ck::Sequence; @@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance = WeiLayout, ck::Tuple, OutLayout, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, AccDataType, CShuffleDataType, - ck::Tuple, - OutDataType, + ck::Tuple, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, @@ -181,13 +187,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<1, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, @@ -290,13 +299,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<2, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, @@ -413,13 +425,16 @@ int main(int argc, char* argv[]) }); return run_grouped_conv_fwd_bias_relu_add<3, - InDataType, - WeiDataType, + InKernelDataType, + WeiKernelDataType, CShuffleDataType, - OutDataType, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, + InUserDataType, + WeiUserDataType, + OutUserDataType, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 44cd5c0694..47d018095d 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -98,6 +98,18 @@ struct AddReluAdd int32_t c = b + x2; y = c; } + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + template <> + __host__ __device__ constexpr void operator()( + int4_t& y, const int8_t& x0, const int4_t& x1, const int4_t& x2) const + { + int32_t a = x0 + x1; + int32_t b = a > 0 ? a : 0; + int32_t c = b + x2; + y = c; + } +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 }; struct AddHardswishAdd