From d6dd154286b162920b387cfd272dc5b42ab351f0 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Mon, 15 Aug 2022 23:11:02 +0800 Subject: [PATCH] Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320) * Implement multiple-reduction in one kernel (kernels, device ops, examples) * Add generic elementwise kernel and device interface * Add generator for normal-distributed data initialization * Add host refer implementation of batchnorm-forward and batchnorm-infer * Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels * Remove un-needed including in batchnorm example * Renaming generic_elementwise to elementiwise in kernel and device classes/functions * Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise * Change in exampe 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise * Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise * Add DeviceElementwiseBase and use it in device_normalize_instance.cpp * Removing and renaming files * Update to synchronize gemm_layernorm client example to the generic element-wise device op API * Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming * Merge two static member functions in device_elementwise.hpp * Remove unary_elementwise_1d kernel and device [ROCm/composable_kernel commit: 53ea4713af15e43f5b11816f20c56f6fc9c7611f] --- .../gemm_add_add_layernorm.cpp | 11 +- .../broadcast_add_2d_amn_bn.cpp | 56 +- .../broadcast_add_3d_am_bmnk.cpp | 73 +-- .../elementwise_add_1d.cpp | 58 +- .../elementwise_add_4d.cpp | 70 +-- .../gemm_bias_relu_add_layernorm_xdl_fp16.cpp | 76 +-- .../gemm_layernorm_xdl_fp16.cpp | 79 ++- example/33_multiple_reduce/CMakeLists.txt | 2 + example/33_multiple_reduce/README.md | 37 ++ .../33_multiple_reduce/dual_reduce_common.hpp | 313 +++++++++ .../dual_reduce_multiblock.cpp | 98 +++ .../dual_reduce_threadwise.cpp | 93 +++ example/34_batchnorm/CMakeLists.txt | 2 + example/34_batchnorm/README.md | 56 ++ example/34_batchnorm/batchnorm_common.hpp | 181 ++++++ .../34_batchnorm/batchnorm_forward_impl.hpp | 295 +++++++++ .../34_batchnorm/batchnorm_forward_nhwc.cpp | 466 ++++++++++++++ example/34_batchnorm/batchnorm_infer_impl.hpp | 119 ++++ example/34_batchnorm/batchnorm_infer_nhwc.cpp | 346 ++++++++++ example/CMakeLists.txt | 3 + .../gpu/device/device_5ary_elementwise.hpp | 353 ----------- .../gpu/device/device_batchnorm_forward.hpp | 44 ++ .../gpu/device/device_batchnorm_infer.hpp | 41 ++ .../gpu/device/device_binary_elementwise.hpp | 247 -------- .../device_cgemm_4gemm_xdl_cshuffle.hpp | 177 +++--- ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 1 - ...nd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp | 1 - .../gpu/device/device_elementwise.hpp | 294 ++++++++- .../gpu/device/device_elementwise_base.hpp | 45 ++ .../gpu/device/device_multiple_reduce.hpp | 58 ++ .../device_multiple_reduce_multiblock.hpp | 595 ++++++++++++++++++ .../device_multiple_reduce_threadwise.hpp | 422 +++++++++++++ .../gpu/device/device_reduce_common.hpp | 52 ++ .../gpu/device/device_unary_elementwise.hpp | 183 ------ .../gpu/element/element_wise_operation.hpp | 57 +- ...dwise_2d_multiple_reduction_multiblock.hpp | 321 ++++++++++ ...dwise_2d_multiple_reduction_threadwise.hpp | 264 ++++++++ .../gpu/grid/gridwise_5ary_Elementwise_1d.hpp | 254 -------- .../grid/gridwise_binary_elementwise_1d.hpp | 155 ----- .../gpu/grid/gridwise_elementwise_1d.hpp | 191 ++++++ 
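The example programs touched by this patch all migrate from the removed DeviceBinaryElementwise / Device5AryElementwise classes to the single generic DeviceElementwise device op, whose MakeArgumentPointer now takes the lengths and stride arrays before the data pointers. The fragment below is distilled from the updated broadcast_add_2d example further down in this diff (the surrounding variables such as M, N, Stride, input, output and the DeviceElementwiseAddInstance alias are defined there); it only contrasts the old and new call order and is not a complete program.

```cpp
// Old API (DeviceBinaryElementwise, removed by this patch):
//   MakeArgumentPointer(input, output, lengths, {in strides...}, {out strides...}, op)
// New API (DeviceElementwise): lengths/strides are std::array<index_t, NumDim> and come first.
std::array<ck::index_t, 2> abc_lengths = {M, N};
std::array<ck::index_t, 2> a_strides   = {Stride, 1};
std::array<ck::index_t, 2> b_strides   = {0, 1}; // broadcast B along the M dimension
std::array<ck::index_t, 2> c_strides   = {Stride, 1};

auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument     = broadcastAdd.MakeArgumentPointer(
    abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
```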
.../gridwise_set_multiple_buffer_value.hpp | 86 +++ .../grid/gridwise_unary_elementwise_1d.hpp | 132 ---- .../reference_batchnorm_forward_nhwc_c.hpp | 259 ++++++++ .../cpu/reference_batchnorm_infer_nhwc_c.hpp | 191 ++++++ .../gpu/device_elementwise_instance.hpp | 9 +- .../library/utility/host_tensor_generator.hpp | 18 + .../elementwise/device_normalize_instance.cpp | 18 +- 47 files changed, 5195 insertions(+), 1707 deletions(-) create mode 100644 example/33_multiple_reduce/CMakeLists.txt create mode 100644 example/33_multiple_reduce/README.md create mode 100644 example/33_multiple_reduce/dual_reduce_common.hpp create mode 100644 example/33_multiple_reduce/dual_reduce_multiblock.cpp create mode 100644 example/33_multiple_reduce/dual_reduce_threadwise.cpp create mode 100644 example/34_batchnorm/CMakeLists.txt create mode 100644 example/34_batchnorm/README.md create mode 100644 example/34_batchnorm/batchnorm_common.hpp create mode 100644 example/34_batchnorm/batchnorm_forward_impl.hpp create mode 100644 example/34_batchnorm/batchnorm_forward_nhwc.cpp create mode 100644 example/34_batchnorm/batchnorm_infer_impl.hpp create mode 100644 example/34_batchnorm/batchnorm_infer_nhwc.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp index 8f14293728..9b157f29a1 100644 --- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp @@ -128,11 +128,14 @@ bool RunDeviceNormalize2D(normalize_op_ptr& p_op, std::array output = {p_y}; auto normalize_functor = ck::tensor_operation::element_wise::Normalize{}; - auto argument_ptr = p_op->MakeArgumentPointer(input, + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideX, 1}; + + auto argument_ptr = p_op->MakeArgumentPointer(xyLengths, + {xyStrides, {1, 
0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, output, - {M, N}, - {{StrideX, 1}, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, - {{StrideX, 1}}, ck::tensor_operation::element_wise::Normalize{}); if(p_op->IsSupportedArgument(argument_ptr.get())) diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index 58ee6f7537..50604da18e 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -16,28 +16,23 @@ using F16 = ck::half_t; using F32 = float; -using ABDataType = F16; -using CDataType = F16; -using EltwiseComputeDataType = F32; +using ABDataType = F16; +using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceBinaryElementwise; + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 2, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; template void host_broadcast2D( @@ -49,19 +44,19 @@ void host_broadcast2D( { for(int n = 0; n < N; ++n) { - ComputeDataType Amn = ck::type_convert(A(m, n)); - ComputeDataType Cmn = 0; + auto Amn = A(m, n); + ctype Cmn = 0; if constexpr(broadcastDim == 0) { - ComputeDataType Bn = ck::type_convert(B(n)); + auto Bn = B(n); functor(Cmn, Amn, Bn); } else { - ComputeDataType Bm = ck::type_convert(B(m)); + auto Bm = B(m); functor(Cmn, Amn, Bm); } - C(m, n) = ck::type_convert(Cmn); + C(m, n) = Cmn; } } } @@ -103,18 +98,19 @@ int main() b_n_device_buf.GetDeviceBuffer()}; std::array output = {c_m_n_device_buf.GetDeviceBuffer()}; - std::vector a_strides = {Stride, 1}; - std::vector b_strides = {0, 1}; - std::vector c_strides = {Stride, 1}; + std::array abc_lengths = {M, N}; + std::array a_strides = {Stride, 1}; + std::array b_strides = {0, 1}; + std::array c_strides = {Stride, 1}; auto broadcastAdd = DeviceElementwiseAddInstance{}; auto argument = broadcastAdd.MakeArgumentPointer( - input, output, {M, N}, {a_strides, b_strides}, {c_strides}, Add{}); + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceBinaryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); }; auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); @@ -129,12 +125,8 @@ int main() c_m_n_device_buf.FromDevice(c_m_n.mData.data()); Tensor host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); - host_broadcast2D, - Tensor, - Tensor, - EltwiseComputeDataType, - Add, - 0>(host_c_m_n, a_m_n, b_n, M, N, Add{}); + host_broadcast2D, Tensor, Tensor, Add, 0>( + host_c_m_n, a_m_n, b_n, M, N, Add{}); pass &= ck::utils::check_err( c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results c", 1e-3, 1e-3); diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp index ac44673d56..9f2e1e7850 100644 --- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp +++ 
b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -16,29 +16,21 @@ using F16 = ck::half_t; using F32 = float; -using ABDataType = F16; -using CDataType = F16; -using EltwiseComputeDataType = F32; +using ABDataType = F16; +using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceBinaryElementwise; + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 3, + 8, + ck::Sequence<1, 8>, + ck::Sequence<8>>; -template +template void host_broadcast3D_am_bmnk(HostTensorC& C, const HostTensorA& A, const HostTensorB& B, @@ -51,11 +43,11 @@ void host_broadcast3D_am_bmnk(HostTensorC& C, for(std::size_t n = 0; n < shape[1]; ++n) for(std::size_t k = 0; k < shape[2]; ++k) { - ComputeDataType a_val = ck::type_convert(A(m)); - ComputeDataType b_val = ck::type_convert(B(m, n, k)); - ComputeDataType c_val = 0; + auto a_val = A(m); + auto b_val = B(m, n, k); + ctype c_val = 0; functor(c_val, a_val, b_val); - C(m, n, k) = ck::type_convert(c_val); + C(m, n, k) = c_val; } } @@ -85,25 +77,25 @@ int main() b_m_n_k_device_buf.GetDeviceBuffer()}; std::array output = {c_m_n_k_device_buf.GetDeviceBuffer()}; - std::vector a_strides = {1, 0, 0}; - std::vector b_strides{b_m_n_k.mDesc.GetStrides().begin(), - b_m_n_k.mDesc.GetStrides().end()}; - std::vector c_strides{c_m_n_k.mDesc.GetStrides().begin(), - c_m_n_k.mDesc.GetStrides().end()}; + std::array abc_lengths; + std::array a_strides = {1, 0, 0}; + std::array b_strides; + std::array c_strides; + + std::copy(mnk.begin(), mnk.end(), abc_lengths.begin()); + std::copy( + b_m_n_k.mDesc.GetStrides().begin(), b_m_n_k.mDesc.GetStrides().end(), b_strides.begin()); + std::copy( + c_m_n_k.mDesc.GetStrides().begin(), c_m_n_k.mDesc.GetStrides().end(), c_strides.begin()); auto broadcastAdd = DeviceElementwiseAddInstance{}; - auto argument = - broadcastAdd.MakeArgumentPointer(input, - output, - std::vector{mnk.begin(), mnk.end()}, - {a_strides, b_strides}, - {c_strides}, - Add{}); + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceBinaryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); }; auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); @@ -118,11 +110,8 @@ int main() c_m_n_k_device_buf.FromDevice(c_m_n_k.mData.data()); Tensor host_c_m_n_k(mnk); - host_broadcast3D_am_bmnk, - Tensor, - Tensor, - EltwiseComputeDataType, - Add>(host_c_m_n_k, a_m, b_m_n_k, mnk, Add{}); + host_broadcast3D_am_bmnk, Tensor, Tensor, Add>( + host_c_m_n_k, a_m, b_m_n_k, mnk, Add{}); pass &= ck::utils::check_err( c_m_n_k.mData, host_c_m_n_k.mData, "Error: Incorrect results c", 1e-3, 1e-3); diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index 18c12c3e4d..d123798fef 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ 
b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -5,7 +5,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -15,29 +15,21 @@ using F16 = ck::half_t; using F32 = float; -using ABDataType = F16; -using CDataType = F16; -using EltwiseComputeDataType = F32; +using ABDataType = F16; +using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceBinaryElementwise; + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 1, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; -template +template void host_elementwise1D( HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor) { @@ -45,11 +37,11 @@ void host_elementwise1D( for(int m = 0; m < M; ++m) { - ComputeDataType Am = ck::type_convert(A(m)); - ComputeDataType Bm = ck::type_convert(B(m)); - ComputeDataType Cm = 0; + auto Am = A(m); + auto Bm = B(m); + ctype Cm = 0; functor(Cm, Am, Bm); - C(m) = ck::type_convert(Cm); + C(m) = Cm; } } @@ -83,18 +75,19 @@ int main() b_m_device_buf.GetDeviceBuffer()}; std::array output = {c_m_device_buf.GetDeviceBuffer()}; - std::vector a_strides = {1}; - std::vector b_strides = {1}; - std::vector c_strides = {1}; + std::array abc_lengths = {M}; + std::array a_strides = {1}; + std::array b_strides = {1}; + std::array c_strides = {1}; auto broadcastAdd = DeviceElementwiseAddInstance{}; auto argument = broadcastAdd.MakeArgumentPointer( - input, output, {M}, {{a_strides}, b_strides}, {c_strides}, Add{}); + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceBinaryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); }; auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); @@ -109,11 +102,8 @@ int main() c_m_device_buf.FromDevice(c_m.mData.data()); Tensor host_c_m(f_host_tensor_descriptor1d(M, 1)); - host_elementwise1D, - Tensor, - Tensor, - EltwiseComputeDataType, - Add>(host_c_m, a_m, b_m, M, Add{}); + host_elementwise1D, Tensor, Tensor, Add>( + host_c_m, a_m, b_m, M, Add{}); pass &= ck::utils::check_err( c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3); diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index 9817208ae4..4c74526940 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -16,29 +16,21 @@ using F16 = ck::half_t; using F32 = float; -using ABDataType = F16; -using CDataType = F16; -using EltwiseComputeDataType = F32; +using ABDataType = F16; +using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; 
using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceBinaryElementwise; + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 4, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; -template +template void host_elementwise4D(HostTensorC& C, const HostTensorA& A, const HostTensorB& B, @@ -52,11 +44,11 @@ void host_elementwise4D(HostTensorC& C, for(std::size_t h = 0; h < shape[2]; ++h) for(std::size_t w = 0; w < shape[3]; ++w) { - ComputeDataType a_val = ck::type_convert(A(n, c, h, w)); - ComputeDataType b_val = ck::type_convert(B(n, c, h, w)); - ComputeDataType c_val = 0; + auto a_val = A(n, c, h, w); + auto b_val = B(n, c, h, w); + ctype c_val = 0; functor(c_val, a_val, b_val); - C(n, c, h, w) = ck::type_convert(c_val); + C(n, c, h, w) = c_val; } } @@ -85,23 +77,24 @@ int main() b_device_buf.GetDeviceBuffer()}; std::array output = {c_device_buf.GetDeviceBuffer()}; - std::vector a_strides{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}; - std::vector b_strides{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}; - std::vector c_strides{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()}; + std::array abc_lengths; + std::array a_strides; + std::array b_strides; + std::array c_strides; + + std::copy(nchw.begin(), nchw.end(), abc_lengths.begin()); + std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin()); + std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin()); + std::copy(c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end(), c_strides.begin()); auto broadcastAdd = DeviceElementwiseAddInstance{}; - auto argument = - broadcastAdd.MakeArgumentPointer(input, - output, - std::vector{nchw.begin(), nchw.end()}, - {{a_strides}, b_strides}, - {c_strides}, - Add{}); + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceBinaryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); }; auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); @@ -116,11 +109,8 @@ int main() c_device_buf.FromDevice(c.mData.data()); Tensor host_c(nchw); - host_elementwise4D, - Tensor, - Tensor, - EltwiseComputeDataType, - Add>(host_c, a, b, nchw, Add{}); + host_elementwise4D, Tensor, Tensor, Add>( + host_c, a, b, nchw, Add{}); pass &= ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results c", 1e-3, 1e-3); diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp index 8a3c12f6c8..d4fbcfb994 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp @@ -10,7 +10,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" -#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" @@ -94,23 +94,18 @@ using ReferenceGemmInstance = 
ck::tensor_operation::host::ReferenceGemm; // scalarPerVector: LayerNorm_out +using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x(gemm_out), mean, meansquare, gamma, beta + ck::Tuple, // y + NormalizeFunctor, + 2, + 8, // MPerthread + ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta + ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { return HostTensorDescriptor(std::vector({len}), @@ -197,14 +192,9 @@ void host_gemm_layernorm(Tensor& out_m_n, { for(int n = 0; n < N; ++n) { - NormalizeComputeDataType out_acc = 0; - layerNormInst(out_acc, - ck::type_convert(e_m_n(m, n)), - ck::type_convert(mean_m(m)), - ck::type_convert(meanSquare_m(m)), - ck::type_convert(gamma_n(n)), - ck::type_convert(beta_n(n))); - out_m_n(m, n) = ck::type_convert(out_acc); + LayerNormOutDataType out_val = 0; + layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + out_m_n(m, n) = out_val; } } } @@ -339,28 +329,28 @@ int main() beta_device_buf.GetDeviceBuffer()}; std::array output = {layerNorm_device_buf.GetDeviceBuffer()}; - auto normalize = DeviceNormalizeInstance{}; - auto normalize_invoker = normalize.MakeInvoker(); - auto normalize_argument = normalize.MakeArgument(input, - output, - {M, N}, - {StrideE, 1}, - {1, 0}, - {1, 0}, - {0, 1}, - {0, 1}, - {StrideE, 1}, - NormalizeFunctor{}); + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideE, 1}; - if(!normalize.IsSupportedArgument(normalize_argument)) + auto normalize = DeviceNormalizeInstance{}; + auto normalize_invoker = normalize.MakeInvoker(); + auto normalize_argument_ptr = + normalize.MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + NormalizeFunctor{}); + + if(!normalize.IsSupportedArgument(normalize_argument_ptr.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "Device5AryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device, exiting!"); } // run kernel gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false}); - normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); bool pass = true; { @@ -396,7 +386,7 @@ int main() float gemm_reduce_mean_reduce_square_mean_ave_time = gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); float normalize_ave_time = - normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); if(time_kernel) DumpGemmLayerNormPerf; // scalarPerVector: LayerNorm_out +using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x(gemm_out), mean, + // meansquare, + // gamma, beta + ck::Tuple, // y + NormalizeFunctor, + 2, + 8, // MPerthread + ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta + ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { return HostTensorDescriptor(std::vector({len}), @@ -139,7 +136,6 @@ void host_gemm_layernorm(Tensor& out_m_n, int M, int N) { - int StrideE = N; Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); 
Tensor mean_m(f_host_tensor_descriptor1d(M, 1)); @@ -184,14 +180,9 @@ void host_gemm_layernorm(Tensor& out_m_n, { for(int n = 0; n < N; ++n) { - NormalizeComputeDataType out_acc = 0; - layerNormInst(out_acc, - ck::type_convert(e_m_n(m, n)), - ck::type_convert(mean_m(m)), - ck::type_convert(meanSquare_m(m)), - ck::type_convert(gamma_n(n)), - ck::type_convert(beta_n(n))); - out_m_n(m, n) = ck::type_convert(out_acc); + LayerNormOutDataType out_val = 0; + layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + out_m_n(m, n) = out_val; } } } @@ -314,28 +305,28 @@ int main() beta_device_buf.GetDeviceBuffer()}; std::array output = {layerNorm_device_buf.GetDeviceBuffer()}; - auto normalize = DeviceNormalizeInstance{}; - auto normalize_invoker = normalize.MakeInvoker(); - auto normalize_argument = normalize.MakeArgument(input, - output, - {M, N}, - {StrideE, 1}, - {1, 0}, - {1, 0}, - {0, 1}, - {0, 1}, - {StrideE, 1}, - NormalizeFunctor{}); + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideE, 1}; - if(!normalize.IsSupportedArgument(normalize_argument)) + auto normalize = DeviceNormalizeInstance{}; + auto normalize_invoker = normalize.MakeInvoker(); + auto normalize_argument_ptr = + normalize.MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + NormalizeFunctor{}); + + if(!normalize.IsSupportedArgument(normalize_argument_ptr.get())) { - throw std::runtime_error("The runtime parameters seems not supported by the " - "Device5AryElementwise instance, exiting!"); + throw std::runtime_error( + "The runtime parameters seems not supported by the device, exiting"); } // run kernel gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false}); - normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); bool pass = true; { @@ -369,7 +360,7 @@ int main() float gemm_reduce_mean_reduce_square_mean_ave_time = gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); float normalize_ave_time = - normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); if(time_kernel) DumpGemmLayerNormPerf : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1 +``` + +Result +``` +./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1 +launch_and_time_kernel: grid_dim {150, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1> +``` + +## Run ```example_dual_reduce_threadwise``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +./bin/example_dual_reduce_multiblock -D 8000,4,4,4 -v 1 2 1 +``` + +Result +``` +./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1 +launch_and_time_kernel: grid_dim {32, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... 
+Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1> +``` diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp new file mode 100644 index 0000000000..9de98b71ce --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {600, 28, 28, 256}; + size_t n, h, w, c; + + bool do_verification = true; + int init_method = 2; + bool time_kernel = true; + + public: + SimpleAppArgs() + { + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + }; + + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + if(inLengths.size() != 4) + throw std::runtime_error( + "Invalid option format! 
The number of integers is incorrect!"); + + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + + return (0); + }; +}; + +template +static void mean_meansquare_host(const Tensor& in, + Tensor& mean_ref, + Tensor& meansquare_ref, + size_t n, + size_t h, + size_t w, + size_t c) + +{ + auto thread_reduce_func = [&](auto iN) { + AccDataType mean = ck::type_convert(0.0f); + AccDataType meansquare = ck::type_convert(0.0f); + + // compute mean, meanquare, variance, invVariance + for(std::size_t iH = 0; iH < h; iH++) + { + for(std::size_t iW = 0; iW < w; iW++) + { + for(std::size_t iC = 0; iC < c; iC++) + { + AccDataType curr_value = ck::type_convert(in(iN, iH, iW, iC)); + + mean += curr_value; + meansquare += curr_value * curr_value; + }; + } + }; + + mean = mean / (h * w * c); + meansquare = meansquare / (h * w * c); + + mean_ref(iN) = ck::type_convert(mean); + meansquare_ref(iN) = ck::type_convert(meansquare); + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (n + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; it++) + { + std::size_t iN_begin = it * work_per_thread; + std::size_t iN_end = std::min(static_cast((it + 1) * work_per_thread), n); + + auto f = [=] { + for(std::size_t iN = iN_begin; iN < iN_end; iN++) + { + thread_reduce_func(iN); + } + }; + + threads[it] = joinable_thread(f); + } +}; + +using ReduceOperation = ck::reduce::Add; + +using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough; +using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide; + +using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare; +using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide; + +using InElementwiseOperationTuple = + ck::Tuple; +using AccElementwiseOperationTuple = + ck::Tuple; + +template +int mean_meansquare_dual_reduce_test(size_t n, + size_t h, + size_t w, + size_t c, + bool do_verification, + int init_method, + bool time_kernel, + const std::array reduceDims) +{ + const std::vector inLengths = {n, h, w, c}; + + Tensor in(inLengths); + + std::vector outLengths{n}; + + Tensor mean_ref(outLengths); + Tensor mean(outLengths); + Tensor meansquare_ref(outLengths); + Tensor meansquare(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = mean.mDesc.GetStrides(); + + size_t invariant_total_length = n; + size_t reduce_total_length = h * w * c; + + const AccDataType alpha = ck::type_convert(1.0f); + const AccDataType beta = ck::type_convert(0.0f); + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); break; + case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); break; + default: in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } 
+ }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpaceSize()); + DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(do_verification) + { + mean_meansquare_host( + in, mean_ref, meansquare_ref, n, h, w, c); + }; + + constexpr ck::index_t NumInputDim = Rank; + constexpr ck::index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1; + + std::array i_inLengths; + std::array i_inStrides; + std::array i_outLengths; + std::array i_outStrides; + + std::copy(inLengths.begin(), inLengths.end(), i_inLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), i_inStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), i_outLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), i_outStrides.begin()); + + auto dual_reduce_op = DeviceDualReduce{}; + + auto argument_ptr = dual_reduce_op.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + {i_outStrides, i_outStrides}, + reduceDims, + {&alpha, &alpha}, + {&beta, &beta}, + in_dev.GetDeviceBuffer(), + {mean_dev.GetDeviceBuffer(), meansquare_dev.GetDeviceBuffer()}, + ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}), + ck::make_tuple( + AccElementwiseOperation_Mean{static_cast(reduce_total_length)}, + AccElementwiseOperation_Meansquare{static_cast(reduce_total_length)})); + + if(!dual_reduce_op.IsSupportedArgument(argument_ptr.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + return (-1); + }; + + std::string reduce_name = dual_reduce_op.GetTypeString(); + + auto invoker_ptr = dual_reduce_op.MakeInvokerPointer(); + + float avg_time = 0.0f; + + avg_time += invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + + 2 * invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { + mean_dev.FromDevice(mean.mData.data()); + meansquare_dev.FromDevice(meansquare.mData.data()); + pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData); + pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData); + }; + + return (pass ? 0 : 1); +} diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/example/33_multiple_reduce/dual_reduce_multiblock.cpp new file mode 100644 index 0000000000..638934ec06 --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
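The dual-reduce examples configure a single ReduceOperation (Add) with two per-output element-wise tuples, so one kernel pass over the input produces both the mean and the mean of squares. Below is a host-side scalar sketch of the math the device op performs per invariant index, mirroring mean_meansquare_host above; the function name and plain-float types are illustrative only, not part of the device API.

```cpp
#include <cstddef>

// One-pass dual reduction for a single invariant index: both statistics are
// accumulated from the same data, which is why one kernel with two outputs suffices.
void dual_reduce_reference(const float* x, std::size_t reduce_len,
                           float& mean, float& meansquare)
{
    double sum = 0.0, sum_sq = 0.0;      // accumulate in higher precision (AccDataType)
    for(std::size_t k = 0; k < reduce_len; ++k)
    {
        sum    += x[k];                  // InElementwiseOperation_Mean       = PassThrough
        sum_sq += x[k] * x[k];           // InElementwiseOperation_Meansquare = UnarySquare
    }
    mean       = static_cast<float>(sum    / reduce_len); // AccElementwiseOperation = UnaryDivide
    meansquare = static_cast<float>(sum_sq / reduce_len);
    // variance = meansquare - mean * mean  (recovered later by InvVariance / MovingAverage)
}
```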
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "dual_reduce_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = float; +using OutDataTypeTuple = Tuple; +using AccDataType = float; + +// for NHWC layer-norm calculation of mean and meansquare +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +constexpr bool PropagateNan = false; + +constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::Set; + +using DeviceDualReduce = DeviceMultipleReduceMultiBlock<2, + InDataType, + AccDataType, + OutDataTypeTuple, + Rank, + NumReduceDim, + ReduceOperation, + InElementwiseOperationTuple, + AccElementwiseOperationTuple, + OutMemoryDataOperation, + PropagateNan, + 256, + 4, + 64, + 1, + 1, + 1, // InSrcVectorDim + 1, + ck::Sequence<1, 1>>; + +int main(int argc, char* argv[]) +{ + int retval = 0; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test(arg.n, + arg.h, + arg.w, + arg.c, + arg.do_verification, + arg.init_method, + arg.time_kernel, + reduceDims); + } + else + { + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test( + 600, 28, 28, 256, true, 2, true, reduceDims); + }; + + return (retval); +} diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/example/33_multiple_reduce/dual_reduce_threadwise.cpp new file mode 100644 index 0000000000..51b93ccaa1 --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "dual_reduce_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = float; +using OutDataTypeTuple = Tuple; +using AccDataType = float; + +// for NHWC layer-norm calculation of mean and meansquare +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +constexpr bool PropagateNan = false; + +using DeviceDualReduce = DeviceMultipleReduceThreadWise<2, + InDataType, + AccDataType, + OutDataTypeTuple, + Rank, + NumReduceDim, + ReduceOperation, + InElementwiseOperationTuple, + AccElementwiseOperationTuple, + PropagateNan, + 256, + 1, + 4, + 1, // InSrcVectorDim + 2, + ck::Sequence<1, 1>>; + +int main(int argc, char* argv[]) +{ + int retval = 0; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test(arg.n, + arg.h, + arg.w, + arg.c, + arg.do_verification, + arg.init_method, + arg.time_kernel, + reduceDims); + } + else + { + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test( + 8000, 4, 4, 4, true, 2, true, reduceDims); + }; + + return (retval); +} diff --git a/example/34_batchnorm/CMakeLists.txt b/example/34_batchnorm/CMakeLists.txt new file mode 100644 index 0000000000..827435fed8 --- /dev/null +++ b/example/34_batchnorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp) +add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp) diff --git a/example/34_batchnorm/README.md b/example/34_batchnorm/README.md new file mode 100644 index 0000000000..afee4ac670 --- /dev/null +++ b/example/34_batchnorm/README.md @@ -0,0 +1,56 @@ +# Instructions for ```batchnorm nhwc``` Example + +## Run ```batchnorm forward nhwc``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64) +#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes) +#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes) +#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg5: time kernel (0=no, 1=yes) +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1 +``` + +Result +``` +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1 +launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 2.08231 ms, 354.519 GB/s +``` + +Result +``` +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 1 0 2 0 +echo $? 
+0 +``` + +## Run ```batchnorm infer nhwc``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1 +``` + +Result +``` +./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1 +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.28235 ms, 523.329 GB/s +``` + + diff --git a/example/34_batchnorm/batchnorm_common.hpp b/example/34_batchnorm/batchnorm_common.hpp new file mode 100644 index 0000000000..6eac5dd838 --- /dev/null +++ b/example/34_batchnorm/batchnorm_common.hpp @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/utility/data_type.hpp" + +// binary operation used to calculate invVariance from mean and meansquare +struct InvVariance +{ + InvVariance(double epsilon) : epsilon_(epsilon){}; + + template + __host__ __device__ constexpr void operator()(T& y, const T& mean, const T& meansquare) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T tmp_epsilon = type_convert(epsilon_); + + y = meansquare - mean * mean; + y = 1.0f / sqrt(tmp_epsilon + y); + }; + + double epsilon_; +}; + +// (4-in, 2-out) element-wise operation used to update the moving average of mean and variance +struct MovingAverage +{ + MovingAverage(double factor) : factor_(factor){}; + + template + __host__ __device__ constexpr void operator()(T& y0, + T& y1, + const T& mean, + const T& runningMean, + const T& meansquare, + const T& runningVariance) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + + T tmp_factor = type_convert(factor_); + T variance = meansquare - mean * mean; + + y0 = runningMean * (type_convert(1.0f) - tmp_factor) + mean * tmp_factor; + y1 = runningVariance * (type_convert(1.0f) - tmp_factor) + variance * tmp_factor; + }; + + double factor_; +}; + +struct MovingAverageAndInvVariance +{ + MovingAverageAndInvVariance(double epsilon, double factor) + : epsilon_(epsilon), factor_(factor){}; + + template + __host__ __device__ constexpr void operator()(T& y0, // resultRunningMean + T& y1, // resultRunningVariance + T& y2, // saveInvVariance + const T& mean, + const T& runningMean, + const T& meansquare, + const T& runningVariance) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T tmp_epsilon = type_convert(epsilon_); + T tmp_factor = type_convert(factor_); + T variance = meansquare - mean * mean; + + y0 = runningMean * (type_convert(1.0f) - tmp_factor) + mean * tmp_factor; + y1 = runningVariance * (type_convert(1.0f) - tmp_factor) + variance * tmp_factor; + + y2 = 1.0f / sqrt(tmp_epsilon + variance); + }; + + double epsilon_; + double factor_; +}; + +struct NormalizeInInfer +{ + NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {} + + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& variance, + 
const T2& gamma, + const T2& beta) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T2 tmp_x, tmp_y; + + tmp_x = type_convert(x); + + tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * gamma + beta; + y = type_convert(tmp_y); + }; + + double epsilon_; +}; + +struct NormalizeInForward +{ + NormalizeInForward(double epsilon = 1e-4) : epsilon_(epsilon) {} + + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& meansquare, + const T2& gamma, + const T2& beta) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T2 tmp_x, tmp_y; + T2 variance = meansquare - mean * mean; + + tmp_x = type_convert(x); + + tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * gamma + beta; + y = type_convert(tmp_y); + }; + + double epsilon_; +}; + +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) +{ + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::array invariantDims; + + // collect invariant dimensions + int dim = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims[dim] = i; + dim++; + }; + + return invariantDims; +}; diff --git a/example/34_batchnorm/batchnorm_forward_impl.hpp b/example/34_batchnorm/batchnorm_forward_impl.hpp new file mode 100644 index 0000000000..c383c2a63a --- /dev/null +++ b/example/34_batchnorm/batchnorm_forward_impl.hpp @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
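Taken together, the functors above let the forward implementation that follows build batchnorm out of one dual reduction (mean and mean-of-squares) plus a few element-wise kernels. The scalar sketch below shows the per-channel math those functors implement; the variable and function names are illustrative, not part of the device code.

```cpp
#include <cmath>

// Reference-style composition of NormalizeInForward, MovingAverage and InvVariance
// for a single (x, channel) pair.
float bnorm_fwd_reference(float x, float mean, float meansquare,
                          float gamma, float beta,
                          double epsilon, double factor, // factor = exponentialAverageFactor
                          float& runningMean, float& runningVariance,
                          float& saveInvVariance)
{
    const float eps      = static_cast<float>(epsilon);
    const float f        = static_cast<float>(factor);
    const float variance = meansquare - mean * mean;

    // NormalizeInForward: normalize with the freshly computed statistics
    const float y = ((x - mean) / std::sqrt(variance + eps)) * gamma + beta;

    // MovingAverage: blend the new statistics into the running ones
    runningMean     = runningMean * (1.0f - f) + mean * f;
    runningVariance = runningVariance * (1.0f - f) + variance * f;

    // InvVariance: written out when saveMeanAndInvVariance is requested
    saveInvVariance = 1.0f / std::sqrt(eps + variance);

    return y;
}
```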
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" + +#include "batchnorm_common.hpp" + +template +int bnorm_fwd(bool time_kernel, + bool updateMovingAverage, + bool saveMeanAndInvVariance, + const std::array reduceDims, + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* p_scale, + const void* p_bias, + void* p_y, + double exponentialAverageFactor, + void* p_runningMean, + void* p_runningVariance, + double epsilon, + void* p_saveMean, + void* p_saveInvVariance, + void* p_tmp_mean, + void* p_tmp_meansquare) +{ + static_assert(NumBatchNormReduceDim < Rank, + "Invalid number of reduced dimensions for batchnorm!"); + + constexpr ck::index_t NumScaleBiasMeanVarDim = Rank - NumBatchNormReduceDim; + + using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide; + + using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare; + using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide; + + using DeviceMeanAndMeansquareInstance = + ck::tensor_operation::device::DeviceMultipleReduceMultiBlock< + 2, + InOutDataType, + AccDataType, + ck::Tuple, + Rank, + NumBatchNormReduceDim, + ck::reduce::Add, + ck::Tuple, + ck::Tuple, + ck::InMemoryDataOperationEnum::Set, + false, // PropagateNan + 256, + 16, + 16, + 1, + 1, + fastest_dim_is_reduced ? 
1 : 0, + 1, + ck::Sequence<1, 1>>; + + using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x, mean, + // meansquare, + // scale, bias + ck::Tuple, // y + NormalizeInForward, + Rank, + 2, // MPerthread + ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, meansquare, scale, bias + ck::Sequence<1>>; // scalarPerVector: y + + using DeviceInvVarianceInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // mean, meansquare + ck::Tuple, // invVariance + InvVariance, + NumScaleBiasMeanVarDim, + 2, // MPerthread + ck::Sequence<1, 1>, // scalarPerVector: mean, meansquare + ck::Sequence<1>>; // scalarPerVector: invVariance + + using DeviceMovingAverageInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // old moving mean, new mean, + // old moving variance, new + // meansquare + ck::Tuple, // updated moving mean, updated moving variance + MovingAverage, + NumScaleBiasMeanVarDim, + 4, // MPerthread + ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving + // variance, new meansquare + ck::Sequence<1, 1>>; // scalarPerVector: updated moving mean, updated moving variance + + using DeviceMovingAverageAndInvVarianceInstance = + ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // old moving mean, new + // mean, old moving + // variance, new + // meansquare + ck::Tuple, // updated moving mean, updated moving + // variancem, invVariance + MovingAverageAndInvVariance, + NumScaleBiasMeanVarDim, + 4, // MPerthread + ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving + // variance, new meansquare + ck::Sequence<1, 1, 1>>; // scalarPerVector: updated moving mean, updated moving variance + + auto invariantDims = get_invariant_dims(reduceDims); + std::array aligned_scaleBiasMeanVarStrides{0}; + + int i = 0; + for(auto dim : invariantDims) + { + assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]); + + aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i]; + i++; + }; + + int32_t reduceLength = 1; + + for(auto dim : reduceDims) + reduceLength *= xyLengths[dim]; + + int32_t invariantLength = 1; + + for(auto dim : invariantDims) + invariantLength *= xyLengths[dim]; + + size_t total_length = static_cast(invariantLength) * reduceLength; + + float avg_time = 0.0f; + std::size_t num_bytes = 0; + + auto dev_mean_and_meansquare = DeviceMeanAndMeansquareInstance{}; + + void* p_mean = saveMeanAndInvVariance ? 
p_saveMean : p_tmp_mean; + + const AccDataType alpha = ck::type_convert(1.0f); + const AccDataType beta = ck::type_convert(0.0f); + + auto argument_ptr1 = dev_mean_and_meansquare.MakeArgumentPointer( + xyLengths, + xStrides, + bnScaleBiasMeanVarLengths, + {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides}, + reduceDims, + {&alpha, &alpha}, + {&beta, &beta}, + p_x, + {p_mean, p_tmp_meansquare}, + ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}), + ck::make_tuple(AccElementwiseOperation_Mean{reduceLength}, + AccElementwiseOperation_Meansquare{reduceLength})); + + auto dev_normalize = DeviceNormalizeInstance{}; + + auto argument_ptr2 = + dev_normalize.MakeArgumentPointer(xyLengths, + {xStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {yStrides}, + {p_x, p_mean, p_tmp_meansquare, p_scale, p_bias}, + {p_y}, + NormalizeInForward{epsilon}); + + if(!dev_mean_and_meansquare.IsSupportedArgument(argument_ptr1.get()) || + !dev_normalize.IsSupportedArgument(argument_ptr2.get())) + { + std::cout << "The runtime parameters seems not supported by the Devic, exiting!" + << std::endl; + + return (-1); + }; + + auto invoker_ptr1 = dev_mean_and_meansquare.MakeInvokerPointer(); + auto invoker_ptr2 = dev_normalize.MakeInvokerPointer(); + + avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel}); + avg_time += invoker_ptr2->Run(argument_ptr2.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += + (total_length * sizeof(InOutDataType) + invariantLength * 2 * sizeof(AccDataType)) + // No.1 + (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) + + total_length * sizeof(InOutDataType)); // No.2 + + if(saveMeanAndInvVariance && updateMovingAverage) + { + auto dev_moving_average_inv_variance = DeviceMovingAverageAndInvVarianceInstance{}; + + auto argument_ptr3 = dev_moving_average_inv_variance.MakeArgumentPointer( + bnScaleBiasMeanVarLengths, + {bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides}, + {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides}, + {p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance}, + {p_runningMean, p_runningVariance, p_saveInvVariance}, + MovingAverageAndInvVariance{epsilon, exponentialAverageFactor}); + + if(!dev_moving_average_inv_variance.IsSupportedArgument(argument_ptr3.get())) + { + std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl; + + return (-1); + }; + + auto invoker_ptr3 = dev_moving_average_inv_variance.MakeInvokerPointer(); + + avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += invariantLength * (4 + 3) * sizeof(AccDataType) * 2; // No.5 + } + else if(saveMeanAndInvVariance) + { + auto dev_inv_variance = DeviceInvVarianceInstance{}; + auto argument_ptr3 = dev_inv_variance.MakeArgumentPointer( + bnScaleBiasMeanVarLengths, + {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides}, + {bnScaleBiasMeanVarStrides}, + {p_mean, p_tmp_meansquare}, + {p_saveInvVariance}, + InvVariance{epsilon}); + + if(!dev_inv_variance.IsSupportedArgument(argument_ptr3.get())) + { + std::cout << "Runtime parameters not supported by the Device, exiting!" 
<< std::endl; + + return (-1); + }; + + auto invoker_ptr3 = dev_inv_variance.MakeInvokerPointer(); + + avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += invariantLength * (2 + 1) * sizeof(AccDataType); + } + else if(updateMovingAverage) + { + auto dev_moving_average = DeviceMovingAverageInstance{}; + + auto argument_ptr3 = dev_moving_average.MakeArgumentPointer( + bnScaleBiasMeanVarLengths, + {bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides, + bnScaleBiasMeanVarStrides}, + {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides}, + {p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance}, + {p_runningMean, p_runningVariance}, + MovingAverage{exponentialAverageFactor}); + + if(!dev_moving_average.IsSupportedArgument(argument_ptr3.get())) + { + std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl; + + return (-1); + }; + + auto invoker_ptr3 = dev_moving_average.MakeInvokerPointer(); + + avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += invariantLength * (4 + 2) * sizeof(AccDataType) * 2; // No.5 + }; + + if(time_kernel) + { + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + }; + + return (0); +}; diff --git a/example/34_batchnorm/batchnorm_forward_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_nhwc.cpp new file mode 100644 index 0000000000..0b916c838a --- /dev/null +++ b/example/34_batchnorm/batchnorm_forward_nhwc.cpp @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp" + +#include "batchnorm_forward_impl.hpp" + +template +using ReferenceBatchNormFwdInstance = + ck::tensor_operation::host::ReferenceBatchNormFwd_Input_N_H_W_C_Output_C; + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormFwdArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + bool updateMovingAverage; + bool saveMeanAndInvVariance; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension " + "lengths, must have 4 integers for nhwc" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization " + "result by " + "comparing with the host-based batch-normalization" + << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance " + "(0=no, 1=yes)" + << std::endl; + std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance " + 
"(0=no, 1=yes)" + << std::endl; + std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer " + "value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 5 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + updateMovingAverage = std::atoi(argv[optind++]); + saveMeanAndInvVariance = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6) + return (-1); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_fwd_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + bool updateMovingAverage, + bool saveMeanAndInvVariance, + double averageFactor, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr int Rank = 4; + constexpr int NumReduceDim = 3; + + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data of the batchnorm forward algorithm + Tensor x(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + Tensor bnBias(scaleBiasMeanVarLengths); + + // output data of the batchnorm forward algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + Tensor resultSaveMean_ref(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance_ref(scaleBiasMeanVarLengths); + + Tensor resultRunningMean_ref(scaleBiasMeanVarLengths); + Tensor resultRunningVariance_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(updateMovingAverage) + { + if constexpr(std::is_same::value) + { + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + + const float x_mean = 0.0f; + const float x_stddev = 2.5f; + const float noise_stddev = 0.04f; + + resultRunningMean_ref.GenerateTensorValue( + GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.04f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the runningMean to be values with tiny variation to the mean of the x + // values + resultRunningMean_ref.GenerateTensorValue( + 
GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + + // initialize the runningVariance to be values with tiny variation to the variance of + // the x values + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + }; + } + else + { + if constexpr(std::is_same::value) + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + else + x.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize()); + DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize()); + DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize()); + + // mean_dev or resultSaveMean_dev + DeviceMem resultSaveMean_dev(sizeof(AccDataType) * + resultSaveMean_ref.mDesc.GetElementSpaceSize()); + // meansquare_dev or resultSaveInvVariance_dev + DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) * + resultSaveInvVariance_ref.mDesc.GetElementSpaceSize()); + // resultRunningMean_dev + DeviceMem resultRunningMean_dev(sizeof(AccDataType) * + resultRunningMean_ref.mDesc.GetElementSpaceSize()); + // resultRunningVariance_dev + DeviceMem resultRunningVariance_dev(sizeof(AccDataType) * + resultRunningVariance_ref.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + bnBias_dev.ToDevice(bnBias.mData.data()); + + if(updateMovingAverage) + { + resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data()); + resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data()); + }; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + i_scaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + i_scaleBiasMeanVarStrides.begin()); + + int result = 0; + + // used for saving meansquare + DeviceMem workspace(sizeof(AccDataType) * 2 * resultSaveMean_ref.mDesc.GetElementSpaceSize() + + 128); + + void* p_tmp_mean = workspace.GetDeviceBuffer(); + void* p_tmp_meansquare = + static_cast(p_tmp_mean) + + (sizeof(AccDataType) * resultSaveMean_ref.mDesc.GetElementSpaceSize() + 63) / 64 * 64; + + result = bnorm_fwd( + time_kernel, + updateMovingAverage, + saveMeanAndInvVariance, + {0, 1, 2}, + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + 
x_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + bnBias_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + averageFactor, + updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr, + updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr, + epsilon, + saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr, + p_tmp_mean, + p_tmp_meansquare); + + if(result < 0) + return (false); + + bool pass = true; + + if(do_verification) + { + auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{}; + + auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + x.mData.data(), + bnScale.mData.data(), + bnBias.mData.data(), + y_ref.mData.data(), + 0.1, // exponentialAverageFactor + updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, // resultRunningMean + updateMovingAverage ? resultRunningVariance_ref.mData.data() + : nullptr, // resultRunningVariance + epsilon, + saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr); + + if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters seems not supported by the BatchNorm instance, exiting!" + << std::endl; + return (-2); + }; + + auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + y_dev.FromDevice(y.mData.data()); + pass = pass && ck::utils::check_err(y.mData, y_ref.mData); + + if(updateMovingAverage) + { + Tensor resultRunningMean(scaleBiasMeanVarLengths); + Tensor resultRunningVariance(scaleBiasMeanVarLengths); + + resultRunningMean_dev.FromDevice(resultRunningMean.mData.data()); + resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data()); + + pass = + pass && ck::utils::check_err(resultRunningMean.mData, resultRunningMean_ref.mData); + pass = pass && ck::utils::check_err(resultRunningVariance.mData, + resultRunningVariance_ref.mData); + }; + + if(saveMeanAndInvVariance) + { + Tensor resultSaveMean(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance(scaleBiasMeanVarLengths); + + resultSaveMean_dev.FromDevice(resultSaveMean.mData.data()); + resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data()); + + pass = pass && ck::utils::check_err(resultSaveMean.mData, resultSaveMean_ref.mData); + pass = pass && ck::utils::check_err(resultSaveInvVariance.mData, + resultSaveInvVariance_ref.mData); + }; + }; + + return (pass); +}; + +const double epsilon = std::numeric_limits::epsilon(); +static const double averageFactor = 0.1; + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormFwdArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 1) + { + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 3) + { + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + 
arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 5) + { + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 6) + { + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + } + else + { + pass = bnorm_fwd_nhwc_test(true, + 2, + false, // don't time kernel + {128, 16, 16, 1024}, + true, + false, + averageFactor, + epsilon); + }; + + return (pass ? 0 : 1); +} diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp new file mode 100644 index 0000000000..d1164d0ff1 --- /dev/null +++ b/example/34_batchnorm/batchnorm_infer_impl.hpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" + +#include "batchnorm_common.hpp" + +template +int bnorm_infer( + bool time_kernel, + const std::array reduceDims, + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* p_scale, + const void* p_bias, + double epsilon, + const void* p_estimatedMean, + const void* p_estimatedVariance, + void* p_y) +{ + (void)bnScaleBiasMeanVarLengths; + + static_assert(NumBatchNormReduceDim < Rank, + "Invalid number of reduced dimensions for batchnorm!"); + + using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x, mean, + // variance, + // scale, + // bias, + ck::Tuple, // y + NormalizeInInfer, + Rank, + 2, // MPerthread + ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias + ck::Sequence<1>>; // scalarPerVector: y + + auto invariantDims = get_invariant_dims(reduceDims); + std::array aligned_scaleBiasMeanVarStrides{0}; + + int i = 0; + for(auto dim : invariantDims) + { + assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]); + + aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i]; + i++; + }; + + int32_t reduceLength = 1; + + for(auto dim : reduceDims) + reduceLength *= xyLengths[dim]; + + int32_t invariantLength = 1; + + for(auto dim : invariantDims) + invariantLength *= xyLengths[dim]; + + size_t total_length = static_cast(invariantLength) * reduceLength; + + float avg_time = 0.0f; + std::size_t num_bytes = 0; + + auto dev_normalize = DeviceNormalizeInstance{}; + + auto argument_ptr1 = dev_normalize.MakeArgumentPointer( + xyLengths, + {xStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {yStrides}, + {p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias}, + {p_y}, + NormalizeInInfer{epsilon}); + + if(!dev_normalize.IsSupportedArgument(argument_ptr1.get())) + { + std::cout << "The runtime parameters seems not supported by the Devic, exiting!" 
+ << std::endl; + + return (-1); + }; + + auto invoker_ptr1 = dev_normalize.MakeInvokerPointer(); + + avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) + + total_length * sizeof(InOutDataType)); + + if(time_kernel) + { + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + }; + + return (0); +}; diff --git a/example/34_batchnorm/batchnorm_infer_nhwc.cpp b/example/34_batchnorm/batchnorm_infer_nhwc.cpp new file mode 100644 index 0000000000..247fae6d30 --- /dev/null +++ b/example/34_batchnorm/batchnorm_infer_nhwc.cpp @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp" + +#include "batchnorm_infer_impl.hpp" + +template +using ReferenceBatchNormInferInstance = + ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C; + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormInferArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension " + "lengths, must have 4 integers for nhwc" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization " + "result by " + "comparing with the host-based batch-normalization" + << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer " + "value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 3 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns 
are needed!"); + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6) + return (-1); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_infer_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr int Rank = 4; + constexpr int NumReduceDim = 3; + + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data of the batchnorm forward algorithm + Tensor x(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + Tensor bnBias(scaleBiasMeanVarLengths); + + // output data of the batchnorm forward algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + Tensor estimatedMean(scaleBiasMeanVarLengths); + Tensor estimatedVariance(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if constexpr(std::is_same::value) + { + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + + const float x_mean = 0.0f; + const float x_stddev = 2.5f; + const float noise_stddev = 0.0001f; + + estimatedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + estimatedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.0001f; + + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the savedMean to be values with tiny variation to the mean of the x values + estimatedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the variance to be values with tiny variation to the variance of the x values + estimatedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize()); + DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize()); + DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize()); + + // mean_dev or resultSaveMean_dev + DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize()); + // meansquare_dev or resultSaveInvVariance_dev + DeviceMem estimatedVariance_dev(sizeof(AccDataType) * + estimatedVariance.mDesc.GetElementSpaceSize()); + + 
x_dev.ToDevice(x.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + bnBias_dev.ToDevice(bnBias.mData.data()); + estimatedMean_dev.ToDevice(estimatedMean.mData.data()); + estimatedVariance_dev.ToDevice(estimatedVariance.mData.data()); + + using ck::index_t; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + i_scaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + i_scaleBiasMeanVarStrides.begin()); + + int result = 0; + + result = bnorm_infer( + time_kernel, + {0, 1, 2}, + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + bnBias_dev.GetDeviceBuffer(), + epsilon, + estimatedMean_dev.GetDeviceBuffer(), + estimatedVariance_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer()); + + if(result < 0) + return (false); + + bool pass = true; + + if(do_verification) + { + auto batchNormInfer_ref = ReferenceBatchNormInferInstance{}; + + auto argument_ptr_ref = + batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + x.mData.data(), + bnScale.mData.data(), + bnBias.mData.data(), + epsilon, + estimatedMean.mData.data(), + estimatedVariance.mData.data(), + y_ref.mData.data()); + + if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters seems not supported by the BatchNorm instance, exiting!" + << std::endl; + return (-2); + }; + + auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + y_dev.FromDevice(y.mData.data()); + pass = pass && ck::utils::check_err(y.mData, y_ref.mData); + }; + + return (pass); +}; + +static const double epsilon = std::numeric_limits::epsilon(); + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormInferArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 1) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 3) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 5) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 6) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + }; + } + else + { + pass = bnorm_infer_nhwc_test(true, + 2, + false, // don't time kernel + {128, 16, 16, 1024}, + epsilon); + }; + + return (pass ? 
0 : 1); +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 61b384497f..57cacecd26 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -47,3 +47,6 @@ add_subdirectory(29_batched_gemm_bias_e_permute) add_subdirectory(30_grouped_convnd_fwd_bias_relu_add) add_subdirectory(31_batched_gemm_gemm) add_subdirectory(32_batched_gemm_softmax_gemm) +add_subdirectory(33_multiple_reduce) +add_subdirectory(34_batchnorm) + diff --git a/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp deleted file mode 100644 index bd8d7756d2..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp +++ /dev/null @@ -1,353 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include - -#include "ck/utility/common_header.hpp" -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct Device5AryElementwise : public DeviceElementwise<5, 1, NDim, ElementwiseFunctor> -{ - static constexpr auto I0 = Number<0>{}; - - template - static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) - { - const auto m = desc_m.GetLength(I0); - const index_t loop_step = gridSize * blockSize * MPerThread; - const auto pad = math::integer_least_multiple(m, loop_step) - m; - const auto desc_m_pad = - transform_tensor_descriptor(desc_m, - make_tuple(make_right_pad_transform(m, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m_pad; - } - - static auto MakeDescriptor_M(const std::vector& lengths, - const std::vector& stride, - index_t gridSize, - index_t blockSize) - { - auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] - const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - // merge nd to 1d desc - [s0 * s1 * ...] 
- if constexpr(NDim > 1) - { - const auto desc_m = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleOfShape)), - make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), - make_tuple(Sequence<0>{})); - - return PadDescriptor_M_1d(desc_m, gridSize, blockSize); - } - else - return PadDescriptor_M_1d(desc, gridSize, blockSize); - } - - using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using DGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using EGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using FGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - - using Gridwise5AryEltwise = Gridwise5AryElementwise_1D; - - struct Argument : public BaseArgument - { - Argument(const ADataType* p_a, - const BDataType* p_b, - const CDataType* p_c, - const DDataType* p_d, - const EDataType* p_e, - FDataType* p_f, - const std::vector& lengths, - const std::vector& a_strides, - const std::vector& b_strides, - const std::vector& c_strides, - const std::vector& d_strides, - const std::vector& e_strides, - const std::vector& f_strides, - ElementwiseFunctor functor) - : p_a_(p_a), - p_b_(p_b), - p_c_(p_c), - p_d_(p_d), - p_e_(p_e), - p_f_(p_f), - lengths_(lengths), - a_strides_(a_strides), - b_strides_(b_strides), - c_strides_(c_strides), - d_strides_(d_strides), - e_strides_(e_strides), - f_strides_(f_strides), - functor_(functor), - blockSize_(256), - gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future - { - a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_); - b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_); - c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_); - d_grid_desc_m_ = MakeDescriptor_M(lengths, d_strides, gridSize_, blockSize_); - e_grid_desc_m_ = MakeDescriptor_M(lengths, e_strides, gridSize_, blockSize_); - f_grid_desc_m_ = MakeDescriptor_M(lengths, f_strides, gridSize_, blockSize_); - } - - const ADataType* p_a_; - const BDataType* p_b_; - const CDataType* p_c_; - const DDataType* p_d_; - const EDataType* p_e_; - FDataType* p_f_; - std::vector lengths_; - AGridDesc_M a_grid_desc_m_; - BGridDesc_M b_grid_desc_m_; - CGridDesc_M c_grid_desc_m_; - DGridDesc_M d_grid_desc_m_; - EGridDesc_M e_grid_desc_m_; - FGridDesc_M f_grid_desc_m_; - std::vector a_strides_; - std::vector b_strides_; - std::vector c_strides_; - std::vector d_strides_; - std::vector e_strides_; - std::vector f_strides_; - ElementwiseFunctor functor_; - index_t blockSize_; - index_t gridSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto kernel = kernel_5ary_elementwise_1d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize_), - dim3(arg.blockSize_), - 0, - arg.p_a_, - arg.p_b_, - arg.p_c_, - arg.p_d_, - arg.p_e_, - arg.p_f_, - arg.a_grid_desc_m_, - arg.b_grid_desc_m_, - arg.c_grid_desc_m_, - arg.d_grid_desc_m_, - arg.e_grid_desc_m_, - arg.f_grid_desc_m_, - arg.functor_); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument& p_arg) 
{ return IsSupportedArgument(&p_arg); } - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - if(pArg->lengths_.size() != NDim) - return false; - - if(pArg->lengths_.back() % MPerThread != 0) - return false; - - auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { - bool ret = true; - - if(!isLastDimensionCoalesced) - ret = scalarPerVector == 1; - else - ret = MPerThread % scalarPerVector == 0; - - return ret; - }; - - if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->d_strides_.back() == 1, DScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->e_strides_.back() == 1, EScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->f_strides_.back() == 1, FScalarPerVector)) - return false; - - return true; - }; - - static auto MakeArgument(std::array p_inputs, - std::array p_outputs, - std::vector lengths, - std::vector a_strides, - std::vector b_strides, - std::vector c_strides, - std::vector d_strides, - std::vector e_strides, - std::vector f_strides, - ElementwiseFunctor functor) - { - return Argument{static_cast(p_inputs[0]), - static_cast(p_inputs[1]), - static_cast(p_inputs[2]), - static_cast(p_inputs[3]), - static_cast(p_inputs[4]), - static_cast(p_outputs[0]), - lengths, - a_strides, - b_strides, - c_strides, - d_strides, - e_strides, - f_strides, - functor}; - } - - std::unique_ptr - MakeArgumentPointer(std::array p_inputs, - std::array p_outputs, - std::vector lengths, - std::vector> input_strides, - std::vector> output_strides, - ElementwiseFunctor functor) override - { - return std::make_unique(static_cast(p_inputs[0]), - static_cast(p_inputs[1]), - static_cast(p_inputs[2]), - static_cast(p_inputs[3]), - static_cast(p_inputs[4]), - static_cast(p_outputs[0]), - lengths, - input_strides[0], - input_strides[1], - input_strides[2], - input_strides[3], - input_strides[4], - output_strides[0], - functor); - } - - static auto MakeInvoker() { return Invoker{}; } - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - } - - // polymorphic - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "Device5aryElementwise" - << "<" - << "NDim = " << NDim - << "MPerThread = " << MPerThread - << "AScalarPerVector = " << AScalarPerVector - << "BScalarPerVector = " << BScalarPerVector - << "CScalarPerVector = " << CScalarPerVector - << "DScalarPerVector = " << DScalarPerVector - << "EScalarPerVector = " << EScalarPerVector - << "FScalarPerVector = " << FScalarPerVector - << ">"; - // clang-format on - - return str.str(); - } -}; // namespace device - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp new file mode 100644 index 0000000000..842ad5d459 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
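+// Usage sketch: the argument/invoker flow expected from any concrete implementation of this
+// interface (variable names below are illustrative only; no specific instance is implied):
+//
+//   auto argument_ptr = bn_fwd_op->MakeArgumentPointer(xyLengths, xStrides, yStrides,
+//                                                      bnScaleBiasMeanVarLengths,
+//                                                      bnScaleBiasMeanVarStrides,
+//                                                      p_x, bnScale, bnBias, p_y,
+//                                                      exponentialAverageFactor,
+//                                                      resultRunningMean, resultRunningVariance,
+//                                                      epsilon,
+//                                                      resultSaveMean, resultSaveInvVariance);
+//   if(bn_fwd_op->IsSupportedArgument(argument_ptr.get()))
+//       bn_fwd_op->MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, false});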
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormFwd : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer( + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + void* p_y, + double exponentialAverageFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormFwdPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp new file mode 100644 index 0000000000..785d64bf14 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormInfer : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer( + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const void* estimatedMean, + const void* estimatedInvVariance, + void* p_y) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormInferPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp deleted file mode 100644 index ef2ab325a7..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ /dev/null @@ -1,247 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceBinaryElementwise : public DeviceElementwise<2, 1, NDim, ElementwiseFunctor> -{ - static constexpr auto I0 = Number<0>{}; - - template - static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) - { - const auto M = desc_m.GetLength(I0); - const index_t loop_step = gridSize * blockSize * MPerThread; - const auto pad = math::integer_least_multiple(M, loop_step) - M; - const auto desc_m_pad = - transform_tensor_descriptor(desc_m, - make_tuple(make_right_pad_transform(M, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m_pad; - } - - static auto MakeDescriptor_M(const std::vector& lengths, - const std::vector& strides, - index_t gridSize, - index_t blockSize) - { - auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] - const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - // merge nd to 1d desc - [s0 * s1 * ...] - if constexpr(NDim > 1) - { - const auto desc_m = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleOfShape)), - make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), - make_tuple(Sequence<0>{})); - - return PadDescriptor_M_1d(desc_m, gridSize, blockSize); - } - else - return PadDescriptor_M_1d(desc, gridSize, blockSize); - } - - using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); - using GridwiseBinEltwise = GridwiseBinaryElementwise_1D; - - struct Argument : public BaseArgument - { - Argument(const ADataType* p_a, - const BDataType* p_b, - CDataType* p_c, - const std::vector& lengths, - const std::vector& a_strides, - const std::vector& b_strides, - const std::vector& c_strides, - ElementwiseFunctor functor) - : p_a_(p_a), - p_b_(p_b), - p_c_(p_c), - lengths_(lengths), - a_strides_(a_strides), - b_strides_(b_strides), - c_strides_(c_strides), - functor_(functor), - blockSize_(256), - gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future - { - a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_); - b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_); - c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_); - } - - const ADataType* p_a_; - const BDataType* p_b_; - CDataType* p_c_; - std::vector lengths_; - AGridDesc_M a_grid_desc_m_; - BGridDesc_M b_grid_desc_m_; - CGridDesc_M c_grid_desc_m_; - std::vector a_strides_; - std::vector b_strides_; - std::vector c_strides_; - ElementwiseFunctor functor_; - index_t blockSize_; - index_t gridSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto kernel = kernel_binary_elementwise_1d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize_), - 
dim3(arg.blockSize_), - 0, - arg.p_a_, - arg.p_b_, - arg.p_c_, - arg.a_grid_desc_m_, - arg.b_grid_desc_m_, - arg.c_grid_desc_m_, - arg.functor_); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - if(pArg->lengths_.size() != NDim) - return false; - - if(pArg->lengths_.back() % MPerThread != 0) - return false; - - auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { - bool ret = true; - - if(!isLastDimensionCoalesced) - ret = scalarPerVector == 1; - else - ret = MPerThread % scalarPerVector == 0; - - return ret; - }; - - if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector)) - return false; - - if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector)) - return false; - - return true; - }; - - virtual std::unique_ptr - MakeArgumentPointer(std::array p_inputs, - std::array p_outputs, - std::vector lengths, - std::vector> input_strides, - std::vector> output_strides, - ElementwiseFunctor functor) override - { - return std::make_unique(static_cast(p_inputs[0]), - static_cast(p_inputs[1]), - static_cast(p_outputs[0]), - lengths, - input_strides[0], - input_strides[1], - output_strides[0], - functor); - } - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - } - - // polymorphic - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceBinaryElementwise" - << "<" - << "NDim = " << NDim - << "MPerThread = " << MPerThread - << "AScalarPerVector = " << AScalarPerVector - << "BScalarPerVector = " << BScalarPerVector - << "CScalarPerVector = " << CScalarPerVector - << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp index 4277499f99..29978458bb 100644 --- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp @@ -14,7 +14,7 @@ #include "ck/tensor_operation/gpu/device/device_cgemm.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -538,48 +538,43 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle float ave_time = 0; - using Add = ck::tensor_operation::element_wise::Add; - using Subtract = ck::tensor_operation::element_wise::Subtract; - using GridwiseBinAdd = GridwiseBinaryElementwise_1D; - using GridwiseBinSubtract = GridwiseBinaryElementwise_1D; - const auto add_kernel = kernel_binary_elementwise_1d; - const auto subtract_kernel = kernel_binary_elementwise_1d; + using Add = 
ck::tensor_operation::element_wise::Add; + using Subtract = ck::tensor_operation::element_wise::Subtract; + + using GridwiseBinAdd = + GridwiseElementwise_1D, + Tuple, + Tuple, + Tuple, + Add, + MPerThread, + Sequence, + Sequence>; + + using GridwiseBinSubtract = + GridwiseElementwise_1D, + Tuple, + Tuple, + Tuple, + Subtract, + MPerThread, + Sequence, + Sequence>; + + const auto add_kernel = kernel_elementwise_1d, + Tuple, + Tuple, + Tuple, + Add>; + + const auto subtract_kernel = + kernel_elementwise_1d, + Tuple, + Tuple, + Tuple, + Subtract>; if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) { @@ -631,18 +626,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.block_2_ctile_map_); // c_real = aux - aux_2 - ave_time += launch_and_time_kernel(stream_config, - subtract_kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_aux_grid_, - arg.p_aux_2_grid_, - arg.p_c_grid_real_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - Subtract{}); + ave_time += launch_and_time_kernel( + stream_config, + subtract_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_real_), + Subtract{}); ave_time += launch_and_time_kernel(stream_config, @@ -679,18 +674,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.block_2_ctile_map_); // c_imag = aux + aux_2 - ave_time += launch_and_time_kernel(stream_config, - add_kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_aux_grid_, - arg.p_aux_2_grid_, - arg.p_c_grid_imag_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - Add{}); + ave_time += launch_and_time_kernel( + stream_config, + add_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_imag_), + Add{}); } else { @@ -742,18 +737,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.block_2_ctile_map_); // c_real = aux - aux_2 - ave_time += launch_and_time_kernel(stream_config, - subtract_kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_aux_grid_, - arg.p_aux_2_grid_, - arg.p_c_grid_real_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - Subtract{}); + ave_time += launch_and_time_kernel( + stream_config, + subtract_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_real_), + Subtract{}); ave_time += launch_and_time_kernel(stream_config, @@ -790,18 +785,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.block_2_ctile_map_); // c_imag = aux + aux_2 - ave_time += launch_and_time_kernel(stream_config, - add_kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_aux_grid_, - arg.p_aux_2_grid_, - arg.p_c_grid_imag_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - arg.c_grid_desc_m_, - Add{}); + ave_time += launch_and_time_kernel( + stream_config, + add_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_imag_), + Add{}); } return ave_time; diff --git 
a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 9e860f6c40..0349480acc 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -13,7 +13,6 @@ #include "ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp index 50e6b538bd..7919ff633b 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp @@ -13,7 +13,6 @@ #include "ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp index f0946eb846..d0bf49f891 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp @@ -2,38 +2,286 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once -#include -#include -#include "device_base.hpp" +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +#include "ck/host_utility/kernel_launch.hpp" namespace ck { namespace tensor_operation { namespace device { -template -struct DeviceElementwise : public BaseOperator +template +struct DeviceElementwise + : public DeviceElementwiseBase { - virtual std::unique_ptr - MakeArgumentPointer(std::array p_inputs, - std::array p_outputs, - std::vector lengths, - std::vector> input_strides, - std::vector> output_strides, - ElementwiseFunctor functor) = 0; + static constexpr int NumInput = InDataTypeTuple::Size(); + static constexpr int NumOutput = OutDataTypeTuple::Size(); - virtual std::unique_ptr MakeInvokerPointer() = 0; -}; + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size(), + "Tuple size is inconsistent with the number of in/out!"); -template -using DeviceElementwisePtr = - std::unique_ptr>; + static auto GenerateInDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + template + static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) + { + constexpr auto I0 = Number<0>{}; + + const auto m = desc_m.GetLength(I0); + const index_t loop_step = gridSize * blockSize * MPerThread; + const auto pad = math::integer_least_multiple(m, loop_step) - m; + const auto desc_m_pad = + transform_tensor_descriptor(desc_m, + make_tuple(make_right_pad_transform(m, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m_pad; + } + + static auto MakeDescriptor_M(const std::array& lengths, + const std::array& stride, + index_t gridSize, + index_t blockSize) + { + auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + // merge nd to 1d desc - [s0 * s1 * ...] 
+ if constexpr(NumDim > 1) + { + const auto desc_m = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M_1d(desc_m, gridSize, blockSize); + } + else + return PadDescriptor_M_1d(desc, gridSize, blockSize); + } + + template + static auto GenerateInOutGrid1dDescTuple(Number) + { + return generate_tuple( + [&](auto) { + if constexpr(NumDim > 1) + { + return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1); + } + else + { + return MakeDescriptor_M({1}, {1}, 1, 1); + }; + }, + Number{}); + }; + + using InGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); + using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); + + using GridwiseElementwise = GridwiseElementwise_1D; + + struct Argument : public BaseArgument + { + Argument(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) + + : lengths_(lengths), + inStridesArray_(inStridesArray), + outStridesArray_(outStridesArray), + elementwise_op_(elementwise_op), + blockSize_(256), + gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future + { + in_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(in_dev_buffers[I.value]); + }, + Number{}); + + out_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(out_dev_buffers[I.value]); + }, + Number{}); + + in_grid_1d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + lengths, inStridesArray[I.value], gridSize_, blockSize_); + }, + Number{}); + + out_grid_1d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + lengths, outStridesArray[I.value], gridSize_, blockSize_); + }, + Number{}); + } + + InDataTypePointerTuple in_dev_buffers_; + OutDataTypePointerTuple out_dev_buffers_; + InGrid1dDescTuple in_grid_1d_desc_tuple_; + OutGrid1dDescTuple out_grid_1d_desc_tuple_; + + std::array lengths_; + std::array, NumInput> inStridesArray_; + std::array, NumOutput> outStridesArray_; + + ElementwiseOperation elementwise_op_; + index_t blockSize_; + index_t gridSize_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel = kernel_elementwise_1d; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(arg.gridSize_), + dim3(arg.blockSize_), + 0, + arg.in_grid_1d_desc_tuple_, + arg.out_grid_1d_desc_tuple_, + arg.in_dev_buffers_, + arg.out_dev_buffers_, + arg.elementwise_op_); + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if(pArg == nullptr) + return false; + + if(pArg->lengths_.back() % MPerThread != 0) + return false; + + auto IsScalarPerVectorValid = [&](const std::array& lengths, + const std::array& strides, + index_t scalarPerVector) { + if(strides.back() == 1 && lengths.back() % scalarPerVector == 0) + return true; + + if(strides.back() != 1 && scalarPerVector == 1) + return true; + + 
return false; + }; + + bool valid = true; + static_for<0, NumInput, 1>{}([&](auto I) { + if(!IsScalarPerVectorValid( + pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I))) + valid = false; + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + if(!IsScalarPerVectorValid( + pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I))) + valid = false; + }); + + return valid; + }; + + std::unique_ptr + MakeArgumentPointer(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) override + { + return std::make_unique(lengths, + inStridesArray, + outStridesArray, + in_dev_buffers, + out_dev_buffers, + elementwise_op); + } + + static auto MakeInvoker() { return Invoker{}; } + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; +}; // namespace device } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp new file mode 100644 index 0000000000..728faf543d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwiseBase : public BaseOperator +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + static constexpr int NumOutput = OutDataTypeTuple::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; // namespace device + +template +using DeviceElementwiseBasePtr = std::unique_ptr< + DeviceElementwiseBase>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp new file mode 100644 index 0000000000..93202e352e --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/utility/reduction_enums.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduce : public BaseOperator +{ + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (Rank - NumReduceDim > 1) ? 
Rank - NumReduceDim : 1; + + virtual std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStrides, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceMultipleReducePtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp new file mode 100644 index 0000000000..324d6c0d29 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp @@ -0,0 +1,595 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypeTuple::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static_assert(sequence_all_of(OutDstVectorSizeSeq{}, + [](auto vectorSize) { + return (MThreadSliceSize % vectorSize == 0); + }), + "The OutDstVectorSize should completely divide the MThreadSliceSize!"); + + static constexpr bool CheckDataTypeTuple() + { + bool flag = true; + + static_for<0, NumReduction, 1>{}([&](auto I) { + using OutDataType = remove_cvref_t; + flag = + flag && ck::reduce::InMemoryDataOperatonSupportedOnDataType::value; + }); + + return flag; + }; + + static_assert(CheckDataTypeTuple(), + "The OutDataType must support the specified OutMemoryDataOperation!"); + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added + // later + static constexpr bool use_multiblock = + (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); + + static_assert( + ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation), + "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!"); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc 
= make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptor(std::array{}, + std::array{}); + }, + Number{}); + }; + + using InGridDesc_M_K = decltype(MakeSrc2dDescriptor( + std::array{}, std::array{}, 1, 1)); + using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple()); + + static auto MakeDst1dDescriptorForBufferSet(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto length = out_grid_desc_m.GetLength(Number<0>{}); + + const auto pad = math::integer_least_multiple(length, BlockSize) - length; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(length, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple_2() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptorForBufferSet(std::array{}, + std::array{}); + }, + Number{}); + }; + + using OutGridDesc_M_Tuple_2 = decltype(GenerateOutGrid1dDescTuple_2()); + + struct Argument : public BaseArgument + { + Argument(const std::array& inLengths, + const std::array& inStrides, + const std::array& outLengths, + const std::array, NumReduction>& outStridesArray, + const std::array& reduceDims, + const std::array& alphas, + const std::array& betas, + const void* in_dev, + const std::array& out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) + : outLengths_{outLengths}, + outStridesArray_{outStridesArray}, + in_elementwise_op_tuple_{in_elementwise_op_tuple}, + acc_elementwise_op_tuple_{acc_elementwise_op_tuple} + { + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + for(size_t i = 0; i < NumReduction; i++) + { + alpha_values_(i) = *static_cast(alphas[i]); + beta_values_(i) = *static_cast(betas[i]); + }; + + in_dev_ = static_cast(in_dev); + + out_dev_buffers_ = generate_tuple( + [&](auto iR) { + using OutDataTypePointer = + remove_cvref_t; + using OutDataType = remove_cvref_t>; + return static_cast(out_dev_buffers[iR]); + }, + Number{}); + + std::tie(invariant_total_length, reduce_total_length) 
= + get_2d_lengths(inLengths_); + + if constexpr(use_multiblock) + { + + int iterations = 1; + while(true) + { + int testBlkGroupSize = + (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = + (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + in_grid_desc_m_k = + MakeSrc2dDescriptor(inLengths_, inStrides_, blkGroupSize, numBlockTileIteration); + + out_grid_desc_m_tuple = generate_tuple( + [&](auto I) { return MakeDst1dDescriptor(outLengths, outStridesArray[I]); }, + Number{}); + + out_grid_desc_m_tuple_2 = generate_tuple( + [&](auto I) { + return MakeDst1dDescriptorForBufferSet(outLengths, outStridesArray[I]); + }, + Number{}); + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + + gridSize_pre = + math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; + } + + std::array inLengths_; + std::array inStrides_; + + std::array outLengths_; + std::array, NumReduction> outStridesArray_; + + Array alpha_values_; + Array beta_values_; + + const InDataType* in_dev_; + OutDataTypePointerTuple out_dev_buffers_; + + InGridDesc_M_K in_grid_desc_m_k; + OutGridDesc_M_Tuple out_grid_desc_m_tuple; + OutGridDesc_M_Tuple_2 out_grid_desc_m_tuple_2; + + InElementwiseOperationTuple in_elementwise_op_tuple_; + AccElementwiseOperationTuple acc_elementwise_op_tuple_; + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + + size_t gridSize_pre; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + using GridwiseMultipleReduce = + GridwiseMultipleReduction_mk_to_m_multiblock; + + const auto kernel_main = + kernel_multiple_reduce_multiblock; + + float avg_time = 0; + + if constexpr(use_multiblock) + { + auto identity_values = generate_tuple( + [&](auto iR) { + using OutDataType = remove_cvref_t; + return ck::reduce::GetIdentityValueForInMemoryDataOperation( + OutMemoryDataOperation); + }, + Number{}); + + const auto kernel_pre = kernel_multiple_buffer_set_value; + + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + arg.out_grid_desc_m_tuple_2, + arg.out_dev_buffers_, + identity_values); + }; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.in_grid_desc_m_k, + arg.out_grid_desc_m_tuple, + arg.in_elementwise_op_tuple_, + arg.acc_elementwise_op_tuple_, + arg.blkGroupSize, + arg.numBlockTileIteration, + arg.alpha_values_, + arg.in_dev_, + arg.beta_values_, + arg.out_dev_buffers_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(use_multiblock) + { + for(size_t i = 0; i < pArg->beta_values_.Size(); i++) + if(pArg->beta_values_[i] != 0.0f) + return (false); + 
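+
+            // (The beta check above is required because, with AtomicAdd-based
+            // multi-block accumulation, every block adds its partial result directly
+            // into the output buffers; there is no single point where a prior
+            // destination value could be blended in, so only beta == 0 is supported
+            // on this path.)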
}; + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0) + return (false); + }; + // To improve + bool valid = true; + static_for<0, NumReduction, 1>{}([&](auto I) { + if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 && + OutDstVectorSizeSeq::At(I) != 1) + valid = false; + + if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0) + valid = false; + }); + + if(!valid) + return (false); + + if constexpr(use_multiblock) + { + // blkGroupSize of 1 should be handled by Blockwise path using + // InMemoryDataOperationEnum::Set + if(pArg->blkGroupSize == 1) + return (false); + + // This is very strong restriction, but needed to avoid some failure + if(pArg->outLengths_[NumOutputDim - 1] % M_BlockTileSize != 0) + return (false); + } + else + { + // cases with very small reduce_total_length should be handled by ThreadWise kernel + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + }; + + return (true); + }; + + std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStridesArray, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStridesArray, + reduceDims, + alphas, + betas, + in_dev, + out_dev_buffers, + in_elementwise_op_tuple, + acc_elementwise_op_tuple); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceMultipleReduceBlockWise<" : "DeviceMultipleReduceMultiBlock<") << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ","; + str << "OutDstVectorSize"; + static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); }); + str << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp new file mode 100644 index 0000000000..328395ec1c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
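+//
+// DeviceMultipleReduceThreadWise performs NumReduction reductions over the same
+// input tensor in a single kernel launch. The input is viewed as a 2-D (M, K)
+// problem: the invariant dimensions are merged into M and the reduce dimensions
+// into K. Each thread owns MThreadSliceSize rows of M and walks the whole K
+// extent in steps of KThreadSliceSize, so no inter-thread reduction is needed;
+// this path is intended for problems whose reduce_total_length is small and
+// complements DeviceMultipleReduceMultiBlock.
+//
+// Illustrative usage sketch (hypothetical instantiation; the template-argument
+// list is abbreviated and the names are placeholders -- see
+// example/33_multiple_reduce/dual_reduce_threadwise.cpp for a real one):
+//
+//   // auto dual_reduce = DeviceMultipleReduceThreadWise</* data types, Rank,
+//   //     NumReduceDim, ReduceOperation, element-wise op tuples, tile sizes,
+//   //     vector sizes */>{};
+//   // auto arg = dual_reduce.MakeArgumentPointer(inLengths, inStrides,
+//   //     outLengths, outStridesArray, reduceDims, alphas, betas, p_in,
+//   //     out_dev_buffers, in_elementwise_ops, acc_elementwise_ops);
+//   // auto invoker = dual_reduce.MakeInvokerPointer();
+//   // float ave_time = invoker->Run(arg.get(), StreamConfig{nullptr, true});
+//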
+ +#pragma once + +#include +#include + +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypeTuple::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static_assert(sequence_all_of(OutDstVectorSizeSeq{}, + [](auto vectorSize) { + return (MThreadSliceSize % vectorSize == 0); + }), + "The OutDstVectorSize should completely divide the MThreadSliceSize!"); + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, 
ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = + math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptor(std::array{}, + std::array{}); + }, + Number{}); + }; + + using InGridDesc_M_K = decltype(MakeSrc2dDescriptor(std::array{}, + std::array{})); + using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple()); + + struct Argument : public BaseArgument + { + Argument(const std::array& inLengths, + const std::array& inStrides, + const std::array& outLengths, + const std::array, NumReduction>& outStridesArray, + const std::array& reduceDims, + const std::array& alphas, + const std::array& betas, + const void* in_dev, + const std::array& out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) + : outLengths_{outLengths}, + outStridesArray_{outStridesArray}, + in_elementwise_op_tuple_{in_elementwise_op_tuple}, + acc_elementwise_op_tuple_{acc_elementwise_op_tuple} + { + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + for(size_t i = 0; i < NumReduction; i++) + { + alpha_values_(i) = *static_cast(alphas[i]); + beta_values_(i) = *static_cast(betas[i]); + }; + + in_dev_ = static_cast(in_dev); + + out_dev_buffers_ = generate_tuple( + [&](auto iR) { + using OutDataTypePointer = + remove_cvref_t; + using OutDataType = remove_cvref_t>; + return static_cast(out_dev_buffers[iR]); + }, + Number{}); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths_); + + in_grid_desc_m_k = MakeSrc2dDescriptor(inLengths_, inStrides_); + + out_grid_desc_m_tuple = generate_tuple( + [&](auto I) { return MakeDst1dDescriptor(outLengths, 
outStridesArray[I]); }, + Number{}); + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + } + + std::array inLengths_; + std::array inStrides_; + + std::array outLengths_; + std::array, NumReduction> outStridesArray_; + + Array alpha_values_; + Array beta_values_; + + const InDataType* in_dev_; + OutDataTypePointerTuple out_dev_buffers_; + + InGridDesc_M_K in_grid_desc_m_k; + OutGridDesc_M_Tuple out_grid_desc_m_tuple; + + InElementwiseOperationTuple in_elementwise_op_tuple_; + AccElementwiseOperationTuple acc_elementwise_op_tuple_; + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + using GridwiseMultipleReduce = + GridwiseMultipleReduction_mk_to_m_threadwise; + + const auto kernel_main = + kernel_multiple_reduce_threadwise; + + float avg_time = 0; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.in_grid_desc_m_k, + arg.out_grid_desc_m_tuple, + arg.in_elementwise_op_tuple_, + arg.acc_elementwise_op_tuple_, + arg.alpha_values_, + arg.in_dev_, + arg.beta_values_, + arg.out_dev_buffers_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0) + return (false); + }; + + // To improve + bool valid = true; + static_for<0, NumReduction, 1>{}([&](auto I) { + if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 && + OutDstVectorSizeSeq::At(I) != 1) + valid = false; + + if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0) + valid = false; + }); + + if(!valid) + return (false); + + return (true); + }; + + std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStridesArray, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStridesArray, + reduceDims, + alphas, + betas, + in_dev, + out_dev_buffers, + in_elementwise_op_tuple, + acc_elementwise_op_tuple); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceMultipleReduceThreadwise<" << BlockSize << ","; + str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << 1 << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << 
InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ","; + str << "OutDstVectorSize"; + static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); }); + str << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp index 42e74f2993..5dc051be3c 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp @@ -35,6 +35,25 @@ std::pair get_2d_lengths(const std::vector& return std::make_pair(invariant_total_length, reduce_total_length); }; +template +std::pair get_2d_lengths(const std::array& inLengths) +{ + static_assert(Rank <= 6, "bigger Rank size not supported!"); + + long_index_t invariant_total_length = 1; + long_index_t reduce_total_length = 1; + + constexpr int NumInvariantDim = Rank - NumReduceDim; + + for(int i = NumInvariantDim; i < Rank; i++) + reduce_total_length *= inLengths[i]; + + for(int i = 0; i < NumInvariantDim; i++) + invariant_total_length *= inLengths[i]; + + return std::make_pair(invariant_total_length, reduce_total_length); +}; + // helper functions using variadic template arguments template auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) @@ -85,6 +104,39 @@ std::vector shuffle_tensor_dimensions(const std::vector& origL return newLengthsStrides; }; +template +std::array +shuffle_tensor_dimensions(const std::array& origLengthsStrides, + const std::array& reduceDims) +{ + std::array newLengthsStrides; + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + // collect invariant dimensions + int pos = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + newLengthsStrides[pos++] = origLengthsStrides[i]; + }; + + // collect reduce dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) > 0) + { + newLengthsStrides[pos++] = origLengthsStrides[i]; + }; + + return newLengthsStrides; +}; + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp deleted file mode 100644 index 0e67ede13c..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceUnaryElementwise : public BaseOperator -{ - static constexpr auto I0 = Number<0>{}; - - template - static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize) - { - const auto m0 = desc_m0.GetLength(I0); - const index_t loop_step = gridSize * blockSize * ScalarPerVector; - const auto pad = math::integer_least_multiple(m0, loop_step) - m0; - const auto desc_m0_pad = - transform_tensor_descriptor(desc_m0, - make_tuple(make_right_pad_transform(m0, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m0_pad; - } - - static auto MakeDescriptor_M0(const std::vector& shape, - const std::vector& stride, - index_t gridSize, - index_t blockSize) - { - auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] - const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - // merge nd to 1d desc - [s0 * s1 * ...] - if constexpr(Dim > 1) - { - const auto desc_m0 = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleOfShape)), - make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), - make_tuple(Sequence<0>{})); - - return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize); - } - else - return PadDescriptor_M0_1d(desc, gridSize, blockSize); - } - - using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1)); - using GridwiseUEltwise = GridwiseUnaryElementwise_1D; - - struct Argument : public BaseArgument - { - Argument(const ADataType* p_a, - BDataType* p_b, - const std::vector& shape, - const std::vector& stride_a, - const std::vector& stride_b, - ElementwiseFunctor functor) - : p_a_(p_a), - p_b_(p_b), - shape_(shape), - functor_(functor), - blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future - { - index_t tensor_size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies{}); - gridSize_ = GridwiseUEltwise::CalculateGridSize(tensor_size); - a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_); - b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_); - } - - const ADataType* p_a_; - BDataType* p_b_; - std::vector shape_; - GridDesc_M0 a_grid_desc_m0_; - GridDesc_M0 b_grid_desc_m0_; - ElementwiseFunctor functor_; - index_t blockSize_; - index_t gridSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto kernel = kernel_unary_elementwise_1d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize_), - dim3(arg.blockSize_), - 0, - arg.p_a_, - arg.p_b_, - arg.a_grid_desc_m0_, - arg.b_grid_desc_m0_, - arg.functor_); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - 
if(pArg->shape_.back() % ScalarPerVector != 0) - return false; - - return true; - }; - - std::unique_ptr MakeArgumentPointer(const void* p_a, - void* p_b, - std::vector shape, - std::vector stride_a, - std::vector stride_b, - ElementwiseFunctor functor) - { - return std::make_unique(static_cast(p_a), - static_cast(p_b), - shape, - stride_a, - stride_b, - functor); - } - - std::unique_ptr MakeInvokerPointer() { return std::make_unique(); } - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceBinaryElementwise" - << "<" - << "ScalarPerVector = " << ScalarPerVector - << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index f123fbaa3b..b69f5801f0 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -198,17 +198,44 @@ struct Normalize // FIXME: is double absolutely necessary? Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {} - template - __host__ __device__ constexpr void operator()( - T& y, const T& x, const T& mean, const T& mean_square, const T& gamma, const T& beta) const; + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& mean_square, + const T3& gamma, + const T3& beta) const; template <> - __host__ __device__ constexpr void operator()(float& y, - const float& x, - const float& mean, - const float& mean_square, - const float& gamma, - const float& beta) const + __host__ __device__ constexpr void operator()(half_t& y, + const half_t& x, + const float& mean, + const float& mean_square, + const half_t& gamma, + const half_t& beta) const + { + using ck::math::sqrt; + + float variance = mean_square - (mean * mean); + + float tmp_x = type_convert(x); + float tmp_gamma = type_convert(gamma); + float tmp_beta = type_convert(beta); + + float tmp_y = + ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * tmp_gamma + + tmp_beta; + + y = type_convert(tmp_y); + }; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x, + const float& mean, + const float& mean_square, + const float& gamma, + const float& beta) const { using ck::math::sqrt; @@ -217,12 +244,12 @@ struct Normalize }; template <> - __host__ __device__ constexpr void operator()(double& y, - const double& x, - const double& mean, - const double& mean_square, - const double& gamma, - const double& beta) const + __host__ __device__ constexpr void operator()(double& y, + const double& x, + const double& mean, + const double& mean_square, + const double& gamma, + const double& beta) const { using ck::math::sqrt; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp new file mode 100644 index 0000000000..bdebe3816f --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
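+//
+// GridwiseMultipleReduction_mk_to_m_multiblock reduces a 2-D (M, K) view of the
+// input into NumReduction 1-D outputs of length M in one kernel. The K extent is
+// split across block_group_size blocks: each block first accumulates its K tile
+// thread-by-thread, then combines the per-thread partial values with a
+// partitioned blockwise reduction in LDS, and finally commits its block-level
+// partial result to global memory using OutMemoryDataOperation. In the
+// multi-block (AtomicAdd) case the output buffers therefore have to be
+// pre-filled with the reduction identity value, which the device layer does via
+// kernel_multiple_buffer_set_value before launching this kernel.
+//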
+ +#pragma once + +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M_Tuple out_grid_desc_m_tuple, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple, + index_t block_group_size, + index_t num_k_block_tile_iteration, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) +{ + GridwiseMultipleReduction::Run(in_grid_desc_m_k, + out_grid_desc_m_tuple, + in_elementwise_op_tuple, + acc_elementwise_op_tuple, + block_group_size, + num_k_block_tile_iteration, + alpha_values, + p_in_value_global, + beta_values, + p_out_value_global_tuple); +}; + +template +struct GridwiseMultipleReduction_mk_to_m_multiblock +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypePointerTuple::Size() && + NumReduction == OutGridDesc_M_Tuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M_Tuple& out_grid_desc_m_tuple, + const InElementwiseOperationTuple& in_elementwise_op_tuple, + const AccElementwiseOperationTuple& acc_elementwise_op_tuple, + index_t block_group_size, + index_t num_k_block_tile_iteration, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) + { + const auto identityVal = 
ReduceOperation::template GetIdentityValue(); + + // LDS, reused by all reductions + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto out_global_val_buf_tuple = generate_tuple( + [&](auto iR) { + return make_dynamic_buffer( + p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize()); + }, + Number{}); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + auto in_thread_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + auto accu_value_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}( + [&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; }); + }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR)); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, NumReduction, 1>{}([&](auto iR) { + using OutDataTypePointer = remove_cvref_t; + using OutDataType = remove_cvref_t>; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf_tuple(iR)(I)); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I), + accu_value_buf_tuple(iR)(I)); + + accu_value_buf_tuple(iR)(I) *= alpha_values[iR]; + } + }); + + if(thread_k_cluster_id == 0) + { + if(block_group_size == 0 && 
!float_equal_zero{}(beta_values[iR])) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + 1, + false>( + out_grid_desc_m_tuple[iR], + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR), + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf_tuple(iR)(I) += + type_convert(priorDstValueBuf[I]) * beta_values[iR]; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m_tuple[iR], + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf_tuple[iR], + out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR)); + }; + }); + }; +}; // namespace ck + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp new file mode 100644 index 0000000000..1313ec9435 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M_Tuple out_grid_desc_m_tuple, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) +{ + GridwiseMultipleReduction::Run(in_grid_desc_m_k, + out_grid_desc_m_tuple, + in_elementwise_op_tuple, + acc_elementwise_op_tuple, + alpha_values, + p_in_value_global, + beta_values, + p_out_value_global_tuple); +}; + +template +struct GridwiseMultipleReduction_mk_to_m_threadwise +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypePointerTuple::Size() && + NumReduction == OutGridDesc_M_Tuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadBufferDimAccessOrder = + typename conditional, 
Sequence<0, 1>>::type; + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M_Tuple& out_grid_desc_m_tuple, + const InElementwiseOperationTuple& in_elementwise_op_tuple, + const AccElementwiseOperationTuple& acc_elementwise_op_tuple, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) + { + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto out_global_val_buf_tuple = generate_tuple( + [&](auto iR) { + return make_dynamic_buffer( + p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize()); + }, + Number{}); + + StaticBuffer + in_thread_buf; + + auto in_thread_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + auto accu_value_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}( + [&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; }); + }); + + const index_t thread_global_1d_id = get_thread_global_1d_id(); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t reducedLength = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR)); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, NumReduction, 1>{}([&](auto iR) { + using OutDataTypePointer = remove_cvref_t; + using OutDataType = remove_cvref_t>; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I), + accu_value_buf_tuple(iR)(I)); + + accu_value_buf_tuple(iR)(I) *= alpha_values[iR]; + }); + + if(!float_equal_zero{}(beta_values[iR])) + { + StaticBuffer + 
priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + 1, + false>( + out_grid_desc_m_tuple[iR], + make_multi_index(thread_global_1d_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR), + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf_tuple(iR)(I) += + type_convert(priorDstValueBuf[I]) * beta_values[iR]; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m_tuple[iR], + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf_tuple[iR], + out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR)); + }); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp deleted file mode 100644 index 2393734826..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -__global__ void kernel_5ary_elementwise_1d(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - const CDataType* __restrict__ p_c_global, - const DDataType* __restrict__ p_d_global, - const EDataType* __restrict__ p_e_global, - FDataType* __restrict__ p_f_global, - const AGridDesc_M a_grid_desc_m, - const BGridDesc_M b_grid_desc_m, - const CGridDesc_M c_grid_desc_m, - const DGridDesc_M d_grid_desc_m, - const EGridDesc_M e_grid_desc_m, - const FGridDesc_M f_grid_desc_m, - const ElementwiseFunctor functor) -{ - Gridwise5AryEltwise::Run(p_a_global, - p_b_global, - p_c_global, - p_d_global, - p_e_global, - p_f_global, - a_grid_desc_m, - b_grid_desc_m, - c_grid_desc_m, - d_grid_desc_m, - e_grid_desc_m, - f_grid_desc_m, - functor); -} - -// TODO - implement n-ary Elemenetwise_1D, tuple of inputs and tuple of outputs -template -struct Gridwise5AryElementwise_1D -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto thread_desc_m = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); - - using PassThrough = tensor_operation::element_wise::PassThrough; - - static __device__ auto CalculateElementwiseIndex() - { - const index_t global_thread_id = get_thread_global_1d_id(); - return make_multi_index(global_thread_id * MPerThread); - } - - __device__ static void Run(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - const CDataType* __restrict__ p_c_global, - const DDataType* __restrict__ p_d_global, - const EDataType* __restrict__ p_e_global, - FDataType* __restrict__ p_f_global, - const AGridDesc_M a_grid_desc_m, - const BGridDesc_M b_grid_desc_m, - const CGridDesc_M c_grid_desc_m, - const DGridDesc_M d_grid_desc_m, - const EGridDesc_M e_grid_desc_m, - const FGridDesc_M f_grid_desc_m, - const ElementwiseFunctor 
functor) - { - const auto a_global_buf = make_dynamic_buffer( - p_a_global, a_grid_desc_m.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( - p_b_global, b_grid_desc_m.GetElementSpaceSize()); - const auto c_global_buf = make_dynamic_buffer( - p_c_global, c_grid_desc_m.GetElementSpaceSize()); - const auto d_global_buf = make_dynamic_buffer( - p_d_global, d_grid_desc_m.GetElementSpaceSize()); - const auto e_global_buf = make_dynamic_buffer( - p_e_global, e_grid_desc_m.GetElementSpaceSize()); - auto f_global_buf = make_dynamic_buffer( - p_f_global, f_grid_desc_m.GetElementSpaceSize()); - - StaticBuffer a_thread_buf; - StaticBuffer b_thread_buf; - StaticBuffer c_thread_buf; - StaticBuffer d_thread_buf; - StaticBuffer e_thread_buf; - StaticBuffer f_thread_buf; - - const auto thread_store_global_offset = CalculateElementwiseIndex(); - - auto a_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - AScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{a_grid_desc_m, thread_store_global_offset}; - - auto b_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - BScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{b_grid_desc_m, thread_store_global_offset}; - - auto c_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - CScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{c_grid_desc_m, thread_store_global_offset}; - - auto d_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - DScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{d_grid_desc_m, thread_store_global_offset}; - - auto e_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - EScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{e_grid_desc_m, thread_store_global_offset}; - - auto f_global_write = - ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // DstVectorDim - FScalarPerVector, // ScalarPerVector - InMemoryDataOperationEnum::Set, - 1, // DstScalarStrideInVector - false>{ - f_grid_desc_m, thread_store_global_offset, PassThrough{}}; - - const index_t blockSize = get_block_size(); - const index_t blockPerGrid = get_grid_size(); - const auto M = c_grid_desc_m.GetLength(I0); - const index_t loop_step = blockPerGrid * blockSize * MPerThread; - const auto loop_step_index = make_multi_index(loop_step); - - index_t num_iter = M / (loop_step); - do - { - // read and process MPerThread elements - a_global_load.Run( - a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf); - - b_global_load.Run( - b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf); - - c_global_load.Run( - c_grid_desc_m, c_global_buf, thread_desc_m, make_tuple(I0), c_thread_buf); - - d_global_load.Run( - d_grid_desc_m, d_global_buf, thread_desc_m, make_tuple(I0), d_thread_buf); - - e_global_load.Run( - e_grid_desc_m, e_global_buf, thread_desc_m, make_tuple(I0), e_thread_buf); - - static_for<0, MPerThread, 1>{}([&](auto m) { - constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m)); - functor(f_thread_buf(Number{}), - a_thread_buf(Number{}), - b_thread_buf(Number{}), - c_thread_buf(Number{}), - 
d_thread_buf(Number{}), - e_thread_buf(Number{})); - }); - - f_global_write.Run(thread_desc_m, - make_tuple(I0), // SrcSliceOriginIdx - f_thread_buf, - f_grid_desc_m, - f_global_buf); - - a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index); - b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index); - c_global_load.MoveSrcSliceWindow(c_grid_desc_m, loop_step_index); - d_global_load.MoveSrcSliceWindow(d_grid_desc_m, loop_step_index); - e_global_load.MoveSrcSliceWindow(e_grid_desc_m, loop_step_index); - f_global_write.MoveDstSliceWindow(f_grid_desc_m, loop_step_index); - } while(--num_iter); - } -}; - -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp deleted file mode 100644 index d4e7d1421d..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -__global__ void kernel_binary_elementwise_1d(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - CDataType* __restrict__ p_c_global, - const AGridDesc_M a_grid_desc_m, - const BGridDesc_M b_grid_desc_m, - const CGridDesc_M c_grid_desc_m, - const ElementwiseFunctor functor) -{ - GridwiseBinEltwise::Run( - p_a_global, p_b_global, p_c_global, a_grid_desc_m, b_grid_desc_m, c_grid_desc_m, functor); -} - -template -struct GridwiseBinaryElementwise_1D -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto thread_desc_m = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); - - using PassThrough = tensor_operation::element_wise::PassThrough; - - static __device__ auto CalculateElementwiseIndex() - { - const index_t global_thread_id = get_thread_global_1d_id(); - return make_multi_index(global_thread_id * MPerThread); - } - - __device__ static void Run(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - CDataType* __restrict__ p_c_global, - const AGridDesc_M a_grid_desc_m, - const BGridDesc_M b_grid_desc_m, - const CGridDesc_M c_grid_desc_m, - const ElementwiseFunctor functor) - { - const auto a_global_buf = make_dynamic_buffer( - p_a_global, a_grid_desc_m.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( - p_b_global, b_grid_desc_m.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( - p_c_global, c_grid_desc_m.GetElementSpaceSize()); - - StaticBuffer a_thread_buf; - StaticBuffer b_thread_buf; - StaticBuffer c_thread_buf; - - const auto thread_store_global_offset = CalculateElementwiseIndex(); - - auto a_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - AScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{a_grid_desc_m, thread_store_global_offset}; - - auto b_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - BScalarPerVector, // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{b_grid_desc_m, thread_store_global_offset}; - - auto c_global_write = 
- ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // DstVectorDim - CScalarPerVector, // ScalarPerVector - InMemoryDataOperationEnum::Set, - 1, // DstScalarStrideInVector - false>{ - c_grid_desc_m, thread_store_global_offset, PassThrough{}}; - - const index_t blockSize = get_block_size(); - const index_t blockPerGrid = get_grid_size(); - const auto M = c_grid_desc_m.GetLength(I0); - const index_t loop_step = blockPerGrid * blockSize * MPerThread; - const auto loop_step_index = make_multi_index(loop_step); - - index_t num_iter = M / (loop_step); - do - { - // read and process MPerThread elements - a_global_load.Run( - a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf); - - b_global_load.Run( - b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf); - - static_for<0, MPerThread, 1>{}([&](auto m) { - constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m)); - functor(c_thread_buf(Number{}), - a_thread_buf(Number{}), - b_thread_buf(Number{})); - }); - - c_global_write.Run(thread_desc_m, - make_tuple(I0), // SrcSliceOriginIdx - c_thread_buf, - c_grid_desc_m, - c_global_buf); - - a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index); - b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index); - c_global_write.MoveDstSliceWindow(c_grid_desc_m, loop_step_index); - } while(--num_iter); - } -}; - -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp new file mode 100644 index 0000000000..4feb948156 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
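+// Generic N-ary elementwise grid kernel (replaces the unary, binary and 5-ary
+// elementwise kernels deleted elsewhere in this patch). Inputs and outputs are passed
+// as tuples of data pointers plus matching tuples of 1-D grid descriptors; each thread
+// loads MPerThread contiguous elements from every input, applies ElementwiseOperation
+// once per element across all inputs/outputs (generate_tie + unpack2), stores
+// MPerThread elements to every output, and then advances its slice windows by
+// gridSize * blockSize * MPerThread in a grid-stride loop. Per-tensor vector access
+// widths are taken from InScalarPerVectorSeq and OutScalarPerVectorSeq.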
+ +#pragma once + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple, + const OutGrid1dDescTuple out_grid_1d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op) +{ + GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple, + out_grid_1d_desc_tuple, + p_in_global_tuple, + p_out_global_tuple, + elementwise_op); +} + +template +struct GridwiseElementwise_1D +{ + static constexpr index_t NumInput = InDataTypePointerTuple::Size(); + static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); + + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size() && + NumInput == InGrid1dDescTuple::Size() && + NumOutput == OutGrid1dDescTuple::Size(), + "Tuple size is inconsistent with the number of in/out!"); + + static constexpr auto I0 = Number<0>{}; + + static constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + __device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple, + const OutGrid1dDescTuple out_grid_1d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op) + { + const index_t thread_global_id = get_thread_global_1d_id(); + + auto in_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return StaticBuffer{}; + }, + Number{}); + + auto out_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return StaticBuffer{}; + }, + Number{}); + + auto in_global_buf_tuple = generate_tuple( + [&](auto I) { + return make_dynamic_buffer( + p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + auto out_global_buf_tuple = generate_tuple( + [&](auto I) { + return make_dynamic_buffer( + p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread); + + const index_t blockSize = get_block_size(); + const index_t blockPerGrid = get_grid_size(); + const auto M = in_grid_1d_desc_tuple[I0].GetLength(I0); + const index_t loop_step = blockPerGrid * blockSize * MPerThread; + const auto loop_step_index = make_multi_index(loop_step); + + auto in_global_load_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + InScalarPerVectorSeq::At( + I), // ScalarPerVector + 1, // SrcScalarStrideInVector + false>{in_grid_1d_desc_tuple[I], + thread_global_offset}; + }, + Number{}); + + auto out_global_store_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + 
OutScalarPerVectorSeq::At(I), + InMemoryDataOperationEnum::Set, + 1, + false>( + out_grid_1d_desc_tuple[I], thread_global_offset, PassThroughOp{}); + }, + Number{}); + + index_t num_iter = M / (loop_step); + do + { + static_for<0, NumInput, 1>{}([&](auto I) { + in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I], + in_global_buf_tuple[I], + thread_buffer_desc_m, + make_tuple(I0), + in_thread_buf_tuple(I)); + + in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I], + loop_step_index); + }); + + static_for<0, MPerThread, 1>{}([&](auto iM) { + // get reference to in data + const auto in_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM); }, + Number{}); + + // get reference to dst data + auto out_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); }, + Number{}); + + unpack2(elementwise_op, out_data_refs, in_data_refs); + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + out_global_store_tuple(I).Run(thread_buffer_desc_m, + make_tuple(I0), + out_thread_buf_tuple[I], + out_grid_1d_desc_tuple[I], + out_global_buf_tuple(I)); + + out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I], + loop_step_index); + }); + } while(--num_iter); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp new file mode 100644 index 0000000000..88c7b6acfe --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
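+// Helper kernel that fills NumBuffer independent 1-D global buffers, each with its own
+// scalar value: the descriptor, pointer and value tuples must all have NumBuffer
+// entries with matching element types (checked by the static_asserts below), and every
+// thread writes the per-buffer value at its global 1-D index through
+// ThreadwiseTensorSliceTransfer_v1r3. Presumably used to pre-initialize the outputs of
+// the multi-block multiple-reduction path (e.g. with the reduction identity) before
+// partial results are accumulated into them.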
+ +#pragma once + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_buffer_set_value(const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, + DataTypePointerTuple p_global_tuple, + DataTypeTuple value_tuple) + +{ + static_assert(NumBuffer == DataTypePointerTuple::Size() && NumBuffer == DataTypeTuple::Size(), + "The tuple size should be same as NumBuffer!"); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + using DataTypePointer = remove_cvref_t; + using DataTypeFromPointer = remove_pointer_t; + using DataType = remove_cvref_t; + + static_assert(is_same::value, + "Types in tuples does not match!"); + }); + + constexpr auto I0 = Number<0>{}; + + const index_t thread_global_id = get_thread_global_1d_id(); + + auto value_buf_tuple = generate_tuple( + [&](auto iB) { + using DataType = remove_cvref_t; + + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + static_for<0, 1, 1>{}([&](auto J) { value_buf_tuple(iB)(J) = value_tuple[iB]; }); + }); + + auto global_buf_tuple = generate_tuple( + [&](auto iB) { + return make_dynamic_buffer( + p_global_tuple(iB), grid_1d_buffer_desc_tuple[iB].GetElementSpaceSize()); + }, + Number{}); + + constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + using DataType = remove_cvref_t; + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + auto threadwise_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + grid_1d_buffer_desc_tuple[iB], make_multi_index(thread_global_id), PassThroughOp{}); + + threadwise_store.Run(val_buff_desc, + make_tuple(I0), + value_buf_tuple(iB), + grid_1d_buffer_desc_tuple[iB], + global_buf_tuple(iB)); + }); +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp deleted file mode 100644 index 6e7fbbc6c6..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -namespace ck { - -template -__global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global, - BDataType* __restrict__ p_b_global, - const GridDesc_M0 a_grid_desc_m0, - const GridDesc_M0 b_grid_desc_m0, - const ElementwiseFunctor functor) -{ - GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor); -} - -template -struct GridwiseUnaryElementwise_1D -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto thread_desc_m0 = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); - - using PassThrough = tensor_operation::element_wise::PassThrough; - - static __device__ auto CalculateElementwiseIndex() - { - const index_t global_thread_id = get_thread_global_1d_id(); - return make_multi_index(global_thread_id * ScalarPerVector); - } - - __host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0, - const GridDesc_M0 b_grid_desc_m0) - { - return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0); - } - - __host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size) - { - const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector); - - return grid_size; - } - - __device__ static void Run(const ADataType* __restrict__ p_a_global, - BDataType* __restrict__ p_b_global, - const GridDesc_M0 a_grid_desc_m0, - const GridDesc_M0 b_grid_desc_m0, - const ElementwiseFunctor functor) - { - const auto a_global_buf = make_dynamic_buffer( - p_a_global, a_grid_desc_m0.GetElementSpaceSize()); - auto b_global_buf = make_dynamic_buffer( - p_b_global, b_grid_desc_m0.GetElementSpaceSize()); - - StaticBuffer a_thread_buf; - StaticBuffer b_thread_buf; - - const auto thread_store_global_offset = CalculateElementwiseIndex(); - - auto a_global_load = - ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - ScalarPerVector, - 1, // SrcScalarStrideInVector - false>{a_grid_desc_m0, thread_store_global_offset}; - - auto b_global_write = - ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // DstVectorDim - ScalarPerVector, - InMemoryDataOperationEnum::Set, - 1, // DstScalarStrideInVector - false>{ - b_grid_desc_m0, thread_store_global_offset, PassThrough{}}; - - const index_t blockSize = get_block_size(); - const index_t blockPerGrid = get_grid_size(); - const auto m0 = b_grid_desc_m0.GetLength(I0); - const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector; - const auto loop_step_index = make_multi_index(loop_step); - - index_t num_iter = m0 / (loop_step); - do - { - // read and process ScalarPerVector elements - a_global_load.Run( - a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf); - - static_for<0, ScalarPerVector, 1>{}([&](auto m) { - constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m)); - functor(b_thread_buf(Number{}), a_thread_buf(Number{})); - }); - - b_global_write.Run(thread_desc_m0, - make_tuple(I0), // SrcSliceOriginIdx - b_thread_buf, - b_grid_desc_m0, - b_global_buf); - - a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index); - b_global_write.MoveDstSliceWindow(b_grid_desc_m0, loop_step_index); - } while(--num_iter); - } -}; - -} // 
namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp new file mode 100644 index 0000000000..fa45af4997 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormFwd_Input_N_H_W_C_Output_C : public device::DeviceBatchNormFwd<4, 3> +{ + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const InOutDataType* p_x, + const AccDataType* bnScale, + const AccDataType* bnBias, + InOutDataType* p_y, + double exponentialAverageFactor, + AccDataType* resultRunningMean, + AccDataType* resultRunningVariance, + double epsilon, + AccDataType* resultSaveMean, + AccDataType* resultSaveInvVariance) + : p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + p_y_(p_y), + resultRunningMean_(resultRunningMean), + resultRunningVariance_(resultRunningVariance), + resultSaveMean_(resultSaveMean), + resultSaveInvVariance_(resultSaveInvVariance), + exponentialAverageFactor_(exponentialAverageFactor), + epsilon_(epsilon) + { + (void)xStrides; + (void)yStrides; + (void)bnScaleBiasMeanVarStrides; + + if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 || + bnScaleBiasMeanVarLengths[0] != xyLengths[3]) + throw std::runtime_error("Invalid tensor dimensions!"); + + n = xyLengths[0]; + h = xyLengths[1]; + w = xyLengths[2]; + c = xyLengths[3]; + + resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr); + resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr); + } + + const InOutDataType* p_x_; + const AccDataType* bnScale_; + const AccDataType* bnBias_; + InOutDataType* p_y_; + + AccDataType* resultRunningMean_; + AccDataType* resultRunningVariance_; + AccDataType* resultSaveMean_; + AccDataType* resultSaveInvVariance_; + + bool resultSave, resultRunning; + + index_t n, h, w, c; + + double exponentialAverageFactor_; + double epsilon_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + auto thread_reduce_func = [&](auto iC) { + AccDataType reduceSize = type_convert(arg.n) * + type_convert(arg.h) * + type_convert(arg.w); + index_t offset_C = iC; + AccDataType mean = type_convert(0.0f); + AccDataType meansquare = type_convert(0.0f); + + // compute mean, meanquare, variance, invVariance + for(index_t iN = 0; iN < arg.n; iN++) + { + index_t offset_N = iN * arg.h * arg.w * arg.c; + for(index_t iH = 0; iH < arg.h; iH++) + { + index_t offset_H = iH * arg.w * arg.c; + for(index_t iW = 0; iW < arg.w; iW++) + { + index_t offset_W = iW * arg.c; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + mean += x; + meansquare += x * x; + }; + } + }; + + mean = mean / reduceSize; + meansquare = meansquare / reduceSize; + + AccDataType variance = meansquare - mean * mean; + AccDataType invVariance 
= + type_convert(1.0f) / + std::sqrt(type_convert(arg.epsilon_) + variance); + + // save the mean/invVariance if required + if(arg.resultSave) + { + arg.resultSaveMean_[iC] = mean; + arg.resultSaveInvVariance_[iC] = invVariance; + }; + + // update the moving average if required + if(arg.resultRunning) + { + arg.resultRunningMean_[iC] = + arg.resultRunningMean_[iC] * + type_convert(1.0 - arg.exponentialAverageFactor_) + + mean * arg.exponentialAverageFactor_; + arg.resultRunningVariance_[iC] = + arg.resultRunningVariance_[iC] * + type_convert(1.0 - arg.exponentialAverageFactor_) + + variance * arg.exponentialAverageFactor_; + }; + + // Normalization + for(index_t iN = 0; iN < arg.n; iN++) + { + index_t offset_N = iN * arg.h * arg.w * arg.c; + for(index_t iH = 0; iH < arg.h; iH++) + { + index_t offset_H = iH * arg.w * arg.c; + for(index_t iW = 0; iW < arg.w; iW++) + { + index_t offset_W = iW * arg.c; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + AccDataType norm_x = + arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC]; + + arg.p_y_[offset] = type_convert(norm_x); + }; + } + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t ic_begin = it * work_per_thread; + std::size_t ic_end = std::min(static_cast((it + 1) * work_per_thread), arg.c); + + auto f = [=] { + for(std::size_t ic = ic_begin; ic < ic_end; ++ic) + { + thread_reduce_func(ic); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + void* p_y, + double exponentialAverageFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + bnScaleBiasMeanVarLengths, + bnScaleBiasMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + static_cast(p_y), + exponentialAverageFactor, + static_cast(resultRunningMean), + static_cast(resultRunningVariance), + epsilon, + static_cast(resultSaveMean), + static_cast(resultSaveInvVariance)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp new file mode 100644 index 0000000000..45092861f2 --- /dev/null +++ 
b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormInfer_Input_N_H_W_C_Output_C : public device::DeviceBatchNormInfer<4, 3> +{ + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const InOutDataType* p_x, + const AccDataType* bnScale, + const AccDataType* bnBias, + double epsilon, + const AccDataType* estimatedMean, + const AccDataType* estimatedVariance, + InOutDataType* p_y) + : p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + epsilon_(epsilon), + estimatedMean_(estimatedMean), + estimatedVariance_(estimatedVariance), + p_y_(p_y) + { + (void)xStrides; + (void)yStrides; + (void)bnScaleBiasMeanVarStrides; + + if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 || + bnScaleBiasMeanVarLengths[0] != xyLengths[3]) + throw std::runtime_error("Invalid tensor dimensions!"); + + n = xyLengths[0]; + h = xyLengths[1]; + w = xyLengths[2]; + c = xyLengths[3]; + } + + const InOutDataType* p_x_; + const AccDataType* bnScale_; + const AccDataType* bnBias_; + + double epsilon_; + + const AccDataType* estimatedMean_; + const AccDataType* estimatedVariance_; + + InOutDataType* p_y_; + + index_t n, h, w, c; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + auto thread_reduce_func = [&](auto iC) { + index_t offset_C = iC; + AccDataType mean = arg.estimatedMean_[offset_C]; + AccDataType variance = arg.estimatedVariance_[offset_C]; + + AccDataType invVariance = + type_convert(1.0f) / + std::sqrt(type_convert(arg.epsilon_) + variance); + + // Normalization + for(index_t iN = 0; iN < arg.n; iN++) + { + index_t offset_N = iN * arg.h * arg.w * arg.c; + for(index_t iH = 0; iH < arg.h; iH++) + { + index_t offset_H = iH * arg.w * arg.c; + for(index_t iW = 0; iW < arg.w; iW++) + { + index_t offset_W = iW * arg.c; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + AccDataType norm_x = + arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC]; + + arg.p_y_[offset] = type_convert(norm_x); + }; + } + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t ic_begin = it * work_per_thread; + std::size_t ic_end = std::min(static_cast((it + 1) * work_per_thread), arg.c); + + auto f = [=] { + for(std::size_t ic = ic_begin; ic < ic_end; ++ic) + { + thread_reduce_func(ic); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array 
xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleBiasMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const void* estimatedMean, + const void* estimatedVariance, + void* p_y) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + bnScaleBiasMeanVarLengths, + bnScaleBiasMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + epsilon, + static_cast(estimatedMean), + static_cast(estimatedVariance), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp index a9cc8b79dd..a71bbe3e58 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp @@ -17,9 +17,12 @@ namespace tensor_operation { namespace device { namespace instance { -using Normalize = ck::tensor_operation::element_wise::Normalize; -using DeviceNormalizeFromMeanMeanSquarePtr = - ck::tensor_operation::device::DeviceElementwisePtr<5, 1, 2, Normalize>; +using Normalize = ck::tensor_operation::element_wise::Normalize; +using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr< + Tuple, + Tuple, + Normalize, + 2>; void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( std::vector& instances); diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp index b2edaa0eb3..4259862e65 100644 --- a/library/include/ck/library/utility/host_tensor_generator.hpp +++ b/library/include/ck/library/utility/host_tensor_generator.hpp @@ -5,6 +5,7 @@ #include #include +#include #include "ck/ck.hpp" @@ -126,6 +127,23 @@ struct GeneratorTensor_3 } }; +template +struct GeneratorTensor_4 +{ + std::default_random_engine generator; + std::normal_distribution distribution; + + GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){}; + + template + T operator()(Is...) 
+ { + float tmp = distribution(generator); + + return ck::type_convert(tmp); + } +}; + struct GeneratorTensor_Checkboard { template diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp index 12f7901c16..a4e35cfbfd 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -27,19 +27,17 @@ using outputType = F16; using Normalize = ck::tensor_operation::element_wise::Normalize; using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple< // clang-format off - //###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector| - //###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector| - //###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector| - //###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector| - Device5AryElementwise, - Device5AryElementwise, - Device5AryElementwise, - Device5AryElementwise + //###################|| | functor| NDim| MPerThread| | | + DeviceElementwise, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, + DeviceElementwise, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, + DeviceElementwise, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >, + DeviceElementwise, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > // clang-format on >; void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( - std::vector>& instances) + std::vector, Tuple, Normalize, 2>>& + instances) { add_device_operation_instances( instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
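
A minimal host-side sketch of driving the new CPU reference batchnorm-forward operator added above, following the MakeArgumentPointer / MakeInvokerPointer interface it exposes. The scaffolding is illustrative rather than taken from the patch: the template argument order <InOutDataType, AccDataType>, the ck::index_t element type of the length/stride arrays, and the packed-NHWC strides and float host buffers are assumptions made for the sake of the example.

#include <array>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp"

// Sketch only (not part of the patch): run the CPU reference batchnorm-forward on a
// packed NHWC tensor; InOutDataType = AccDataType = float is an arbitrary choice.
int run_reference_bn_forward_sketch()
{
    using Ref = ck::tensor_operation::host::
        ReferenceBatchNormFwd_Input_N_H_W_C_Output_C<float, float>; // arg order assumed

    const ck::index_t N = 2, H = 8, W = 8, C = 16;

    std::array<ck::index_t, 4> xyLengths{N, H, W, C};
    std::array<ck::index_t, 4> xyStrides{H * W * C, W * C, C, 1}; // packed NHWC
    std::array<ck::index_t, 1> scaleBiasMeanVarLengths{C};
    std::array<ck::index_t, 1> scaleBiasMeanVarStrides{1};

    std::vector<float> x(N * H * W * C, 1.0f), y(x.size(), 0.0f);
    std::vector<float> scale(C, 1.0f), bias(C, 0.0f);
    std::vector<float> runningMean(C, 0.0f), runningVar(C, 1.0f);
    std::vector<float> saveMean(C), saveInvVar(C);

    Ref ref_op;
    // Argument order follows the MakeArgumentPointer overload shown in this patch.
    auto arg = ref_op.MakeArgumentPointer(xyLengths,
                                          xyStrides,
                                          xyStrides,
                                          scaleBiasMeanVarLengths,
                                          scaleBiasMeanVarStrides,
                                          x.data(),
                                          scale.data(),
                                          bias.data(),
                                          y.data(),
                                          0.1,  // exponentialAverageFactor
                                          runningMean.data(),
                                          runningVar.data(),
                                          1e-5, // epsilon
                                          saveMean.data(),
                                          saveInvVar.data());

    if(!ref_op.IsSupportedArgument(arg.get()))
        return -1;

    ref_op.MakeInvokerPointer()->Run(arg.get());

    return 0;
}

For populating inputs, the GeneratorTensor_4 added to host_tensor_generator.hpp can presumably be used like the existing generators, e.g. x_host.GenerateTensorValue(GeneratorTensor_4<float>{0.0f, 1.0f}); here GenerateTensorValue is assumed from the existing host Tensor utilities and is not shown in this patch.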