From 1d0b1ea6b082601f399adc7f0fd35e0f01e98e10 Mon Sep 17 00:00:00 2001
From: Qianfeng
Date: Sat, 18 Jun 2022 04:10:25 +0800
Subject: [PATCH] Regulate reduction accumulator operations and element-wise operations (#274)

* Remove template from Reduction operation classes and add template to their operator() and GetIdentityValue() interfaces
* Rework the unary elementwise operators and the reduce_unary_operator (class for mapping), and dependent variations in all host layers
* Remove the data type template parameter from reduce_binary_operator (class for mapping) and dependent variations in host layers
* Add InMemoryDataOperatonSupportedOnDataType to check that a data type is compatible with a given InMemoryDataOperation
* Use struct-scope operator template instantiation for binary and unary element-wise operations
* Change a few more elementwise operations to use a template for operator()
* Tiny correction in the Normalize operator
* Add static_assert to check the data type applicability for some reduction accumulator and element-wise operations
* Correct the usage of ReduceAccDataType in some examples
* Use static_assert for UnaryDivide
* Update merged code to use element-wise operations and reduction accumulator operations correctly
* Tiny fix with regard to SetWorkSpacePointer()

[ROCm/composable_kernel commit: 1f543bfa79de0687f9b6144b5dea10f4190c8892]
---
 example/12_reduce/reduce_blockwise.cpp | 49 +-- .../12_reduce/reduce_blockwise_two_call.cpp | 77 +++-- example/13_pool2d_fwd/pool2d_fwd_common.hpp | 19 +- .../gemm_reduce_xdl_max_fp16.cpp | 13 +- .../gemm_reduce_xdl_mean_squaremean_fp16.cpp | 28 +- .../batched_gemm_reduce_xdl_fp16.cpp | 25 +- .../broadcast_add_2d_amn_bn.cpp | 3 +- .../broadcast_add_3d_am_bmnk.cpp | 3 +- .../elementwise_add_1d.cpp | 3 +- .../elementwise_add_4d.cpp | 3 +- .../gemm_bias_relu_add_layernorm_xdl_fp16.cpp | 26 +- .../gemm_layernorm_xdl_fp16.cpp | 31 +- .../gpu/device/device_base.hpp | 2 +- .../device_cgemm_4gemm_xdl_cshuffle.hpp | 58 ++-- .../device/device_pool2d_fwd_nhwc_nhwc.hpp | 18 +- .../gpu/device/device_reduce_multiblock.hpp | 13 +- .../gpu/device/reduction_operator_mapping.hpp | 161 +++++---- .../element/binary_element_wise_operation.hpp | 219 +++++++++---- .../gpu/element/element_wise_operation.hpp | 305 ++---------------- .../element/unary_element_wise_operation.hpp | 80 +++++ .../grid/gridwise_2d_reduction_multiblock.hpp | 20 +- .../grid/gridwise_2d_reduction_threadwise.hpp | 20 +- ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp | 3 +- .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp | 3 +- .../gpu/grid/gridwise_set_buffer_value.hpp | 2 +- include/ck/utility/reduction_operator.hpp | 147 +++++++-- .../ck/library/host_tensor/host_reduction.hpp | 33 +- .../cpu/reference_conv_bwd_data.hpp | 5 +- .../cpu/reference_gemm_bias_2d.hpp | 4 +- .../device_reduce_instance_blockwise.hpp | 43 ++- ..._reduce_instance_multiblock_atomic_add.hpp | 49 ++- .../device_reduce_instance_threadwise.hpp | 43 ++- ...6_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 6 +- ...6_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 6 +- ...6_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 6 +- ...6_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 6 +- ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 8 +- ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 8 +-
..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 8 +- .../profile_batched_gemm_reduce_impl.hpp | 28 +- .../profile_gemm_bias_add_reduce_impl.hpp | 40 ++- profiler/include/profile_gemm_reduce_impl.hpp | 36 +-- profiler/include/profile_reduce_impl.hpp | 28 +- 48 files changed, 891 insertions(+), 837 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index cc75bbad60..66e9762314 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -33,11 +33,11 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; constexpr bool PropagateNan = true; constexpr bool OutputIndex = false; -using ReduceOperation = typename reduce_binary_operator::opType; +using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; using DeviceReduceInstance = DeviceReduceMultiBlock::GetElementwiseOperator( + static_cast(reduce_total_length)); + if(args.do_verification) { ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run( - alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); }; std::vector i_inLengths; @@ -277,20 +289,19 @@ int main(int argc, char* argv[]) auto reduce = DeviceReduceInstance{}; - auto argument_ptr = reduce.MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - nullptr, - out_dev.GetDeviceBuffer(), - out_index_dev.GetDeviceBuffer(), - InElementwiseOperation{static_cast(reduce_total_length)}, - AccElementwiseOperation{static_cast(reduce_total_length)}); + auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_index_dev.GetDeviceBuffer(), + in_elementwise_op, + acc_elementwise_op); if(!reduce.IsSupportedArgument(argument_ptr.get())) { diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index f42fd08f1e..e4823667a8 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -31,13 +31,13 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; constexpr bool PropagateNan = true; constexpr bool OutputIndex = false; -using ReduceOperation = typename reduce_binary_operator::opType; +using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; -using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; +using PassThroughOp = tensor_operation::element_wise::PassThrough; using DeviceReduceInstance_1 = 
DeviceReduceMultiBlock::GetElementwiseOperator( + static_cast(reduce_total_length)); + if(do_verify) { ReductionHost hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr); + hostReduce.Run(alpha, + in_1.mData.data(), + beta, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); }; std::vector i_inLengths_1; @@ -217,20 +230,19 @@ int main(int argc, char* argv[]) auto reduce_1 = DeviceReduceInstance_1{}; - auto argument_ptr_1 = reduce_1.MakeArgumentPointer( - i_inLengths_1, - i_inStrides_1, - i_inLengths_2, - i_inStrides_2, - reduceDims_1, - 1.0f, - 0.0f, - in_1_dev.GetDeviceBuffer(), - nullptr, - in_2_dev.GetDeviceBuffer(), - nullptr, - InElementwiseOperation{static_cast(reduce_total_length)}, - PassThroughOp{}); + auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1, + i_inStrides_1, + i_inLengths_2, + i_inStrides_2, + reduceDims_1, + 1.0f, + 0.0f, + in_1_dev.GetDeviceBuffer(), + nullptr, + in_2_dev.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + PassThroughOp{}); if(!reduce_1.IsSupportedArgument(argument_ptr_1.get())) { @@ -243,20 +255,19 @@ int main(int argc, char* argv[]) auto reduce_2 = DeviceReduceInstance_2{}; - auto argument_ptr_2 = reduce_2.MakeArgumentPointer( - i_inLengths_2, - i_inStrides_2, - i_outLengths, - i_outStrides, - reduceDims_2, - alpha, - beta, - in_2_dev.GetDeviceBuffer(), - nullptr, - out_dev.GetDeviceBuffer(), - nullptr, - PassThroughOp{}, - AccElementwiseOperation{static_cast(reduce_total_length)}); + auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2, + i_inStrides_2, + i_outLengths, + i_outStrides, + reduceDims_2, + alpha, + beta, + in_2_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + PassThroughOp{}, + acc_elementwise_op); if(!reduce_2.IsSupportedArgument(argument_ptr_2.get())) { diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index 4652ce1189..436bbcd485 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -31,16 +31,15 @@ static void pool_host_verify(const Tensor& in, const std::array& in_left_pads, const std::array& /*in_right_pads*/) { - const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1]; + const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1]; - using ReduceOperation = typename ck::reduce_binary_operator::opType; - using InElementwiseOperation = typename ck:: - reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = typename ck:: - reduce_unary_operator::AccElementwiseOperation; + using ReduceOperation = typename ck::reduce_binary_operator::opType; - const InElementwiseOperation in_elementwise_op(divider); - const AccElementwiseOperation acc_elementwise_op(divider); + auto elementwise_ops = + ck::reduce_unary_operator::GetElementwiseOperator(reduceLength); + + auto in_elementwise_op = std::get<0>(elementwise_ops); + auto acc_elementwise_op = std::get<1>(elementwise_ops); if constexpr(!OutputIndex) { @@ -48,7 +47,7 @@ static void pool_host_verify(const Tensor& in, ck::detail::AccumulateWithNanCheck; auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { - auto accuVal = ReduceOperation::GetIdentityValue(); + auto accuVal = ReduceOperation::template GetIdentityValue(); for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) { @@ -86,7 +85,7 @@ static void pool_host_verify(const Tensor& in, 
AccDataType, IndexDataType>; auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { - auto accuVal = ReduceOperation::GetIdentityValue(); + auto accuVal = ReduceOperation::template GetIdentityValue(); IndexDataType accuIndex = 0; for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp index 4469130502..8f0d25059d 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp @@ -41,9 +41,8 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -using DsReduceOp = ck::Tuple>; -using DsElementOp = ck::Tuple< - ck::tensor_operation::element_wise::UnaryIdentic>; +using DsReduceOp = ck::Tuple; +using DsElementOp = ck::Tuple; using DGlobalMemOp = ck::InMemoryDataOperationEnumSequence; @@ -236,10 +235,14 @@ int main(int argc, char* argv[]) for(int m = 0; m < M; ++m) { - ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue(); + ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) - d_reduce_op(d_acc, c_m_n_host_result(m, n)); + { + ReduceAccDataType curr_val = + ck::type_convert(c_m_n_host_result(m, n)); + d_reduce_op(d_acc, curr_val); + }; d_m_host_result(m) = d_acc; } diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp index 5122317719..018645e066 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp @@ -41,18 +41,15 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -using D0ReduceOp = ck::reduce::Add; -using D1ReduceOp = ck::reduce::Add; +using D0ReduceOp = ck::reduce::Add; +using D1ReduceOp = ck::reduce::Add; using DxsReduceOp = ck::Tuple; -using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnaryDivElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; -using DxsInElementOps = ck::Tuple; -using DxsOutElementOps = ck::Tuple; +using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; +using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; +using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; +using DxsInElementOps = ck::Tuple; +using DxsOutElementOps = ck::Tuple; using DGlobalMemOp = ck::InMemoryDataOperationEnumSequence(); + auto d1_acc = d1_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) { - ReduceAccDataType c_val = - ck::type_convert(c_m_n_host_result(m, n)); - ReduceAccDataType d0_val = 0; - ReduceAccDataType d1_val = 0; + auto c_val = ck::type_convert(c_m_n_host_result(m, n)); + ReduceAccDataType d0_val; + ReduceAccDataType d1_val; dxs_in_element_op(ck::Number<0>{})(d0_val, c_val); dxs_in_element_op(ck::Number<1>{})(d1_val, c_val); diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index 
e89f8a61e0..de584ad7e8 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -39,16 +39,14 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -using D0ReduceOp = ck::reduce::Add; -using D1ReduceOp = ck::reduce::Add; +using D0ReduceOp = ck::reduce::Add; +using D1ReduceOp = ck::reduce::Add; using DxsReduceOp = ck::Tuple; -using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; -using DxsInElementOps = ck::Tuple; -using DxsOutElementOps = ck::Tuple; +using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; +using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; +using DxsInElementOps = ck::Tuple; +using DxsOutElementOps = ck::Tuple; using DGlobalMemOp = ck::InMemoryDataOperationEnumSequence(); + auto d1_acc = d1_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) { - float c_val = ck::type_convert(c_g_m_n_host_result(batch, m, n)); - float d0_val = 0; - float d1_val = 0; + auto c_val = + ck::type_convert(c_g_m_n_host_result(batch, m, n)); + ReduceAccDataType d0_val; + ReduceAccDataType d1_val; UnaryIdenticElementOp{}(d0_val, c_val); UnarySquareElementOp{}(d1_val, c_val); diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index 54557b6e7e..587882ed9c 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -42,8 +42,7 @@ using ABDataType = F16; using CDataType = F16; using EltwiseComputeDataType = F32; -using Add = ck::tensor_operation::binary_element_wise:: - Add; +using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device::DeviceBinaryElementwise; +using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device::DeviceBinaryElementwise; +using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device::DeviceBinaryElementwise; +using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device::DeviceBinaryElementwise; +using ReduceSumOp = ck::reduce::Add; using DxsReduceOp = ck::Tuple; -using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnaryDivElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; -using DxsInElementOps = ck::Tuple; -using DxsOutElementOps = ck::Tuple; +using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; +using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; +using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; +using DxsInElementOps = ck::Tuple; +using DxsOutElementOps = ck::Tuple; using DxsGlobalMemOp = ck::InMemoryDataOperationEnumSequence& out_m_n, auto reduceSumOpInst = ReduceSumOp{}; for(int m = 0; m < M; ++m) { - AccDataType mean_acc = reduceSumOpInst.GetIdentityValue(); - AccDataType square_mean_acc = reduceSumOpInst.GetIdentityValue(); + 
auto mean_acc = reduceSumOpInst.GetIdentityValue(); + auto square_mean_acc = reduceSumOpInst.GetIdentityValue(); for(int n = 0; n < N; ++n) { @@ -207,7 +204,12 @@ void host_gemm_layernorm(Tensor& out_m_n, for(int n = 0; n < N; ++n) { AccDataType out_acc = 0; - layerNormInst(out_acc, c_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + layerNormInst(out_acc, + static_cast(c_m_n(m, n)), + static_cast(mean_m(m)), + static_cast(meanSquare_m(m)), + static_cast(gamma_n(n)), + static_cast(beta_n(n))); out_m_n(m, n) = static_cast(out_acc); } } diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp index d6890a31cd..3bf01aa9da 100644 --- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp @@ -44,17 +44,14 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -using ReduceSumOp = ck::reduce::Add; +using ReduceSumOp = ck::reduce::Add; using DxsReduceOp = ck::Tuple; -using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnaryDivElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; -using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; -using DxsInElementOps = ck::Tuple; -using DxsOutElementOps = ck::Tuple; +using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; +using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; +using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; +using DxsInElementOps = ck::Tuple; +using DxsOutElementOps = ck::Tuple; using DxsGlobalMemOp = ck::InMemoryDataOperationEnumSequence& out_m_n, auto reduceSumOpInst = ReduceSumOp{}; for(int m = 0; m < M; ++m) { - float mean_acc = reduceSumOpInst.GetIdentityValue(); - float square_mean_acc = reduceSumOpInst.GetIdentityValue(); + auto mean_acc = reduceSumOpInst.GetIdentityValue(); + auto square_mean_acc = reduceSumOpInst.GetIdentityValue(); for(int n = 0; n < N; ++n) { - ReduceAccDataType c_val = ck::type_convert(c_m_n(m, n)); - ReduceAccDataType square_c_val = 0; + auto c_val = ck::type_convert(c_m_n(m, n)); + auto square_c_val = reduceSumOpInst.GetIdentityValue(); + UnarySquareElementOp{}(square_c_val, c_val); reduceSumOpInst(mean_acc, c_val); @@ -182,7 +180,12 @@ void host_gemm_layernorm(Tensor& out_m_n, for(int n = 0; n < N; ++n) { float out_f32 = 0; - layerNormInst(out_f32, c_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + layerNormInst(out_f32, + static_cast(c_m_n(m, n)), + static_cast(mean_m(m)), + static_cast(meanSquare_m(m)), + static_cast(gamma_n(n)), + static_cast(beta_n(n))); out_m_n(m, n) = static_cast(out_f32); } } diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index 40b9b07a01..809eba5578 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -44,7 +44,7 @@ struct BaseOperator virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; } - virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const final + virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const { assert(p_arg); p_arg->p_workspace_ = p_workspace; diff --git 
a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp index 4e1aada6da..df2805b886 100644 --- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp @@ -557,11 +557,9 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle float ave_time = 0; - using Add = - ck::tensor_operation::binary_element_wise::Add; - using Substract = ck::tensor_operation::binary_element_wise:: - Substract; - using GridwiseBinAdd = GridwiseBinaryElementwise_1D; - using GridwiseBinSubstract = GridwiseBinaryElementwise_1D; - const auto add_kernel = kernel_binary_elementwise_1d; + const auto add_kernel = kernel_binary_elementwise_1d; - const auto substract_kernel = kernel_binary_elementwise_1d; + const auto subtract_kernel = kernel_binary_elementwise_1d; if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) { @@ -653,7 +651,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle // c_real = aux - aux_2 ave_time += launch_and_time_kernel(stream_config, - substract_kernel, + subtract_kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -663,7 +661,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.c_grid_desc_m_, arg.c_grid_desc_m_, arg.c_grid_desc_m_, - Substract{}); + Subtract{}); ave_time += launch_and_time_kernel(stream_config, @@ -764,7 +762,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle // c_real = aux - aux_2 ave_time += launch_and_time_kernel(stream_config, - substract_kernel, + subtract_kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -774,7 +772,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle arg.c_grid_desc_m_, arg.c_grid_desc_m_, arg.c_grid_desc_m_, - Substract{}); + Subtract{}); ave_time += launch_and_time_kernel(stream_config, diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index c7e18d98dc..41fb11b7de 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -35,14 +35,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd using IndexDataType = int32_t; - using ReduceOperation = typename reduce_binary_operator::opType; + using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; static constexpr index_t InSrcOutDstVectorDim = 0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is @@ -178,13 +177,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd invariant_lowest_length_ = C; reduce_lowest_length_ = window_spatial_lengths[1]; - // TODO: is this correct? 
- if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG) - { - ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1]; - in_element_op_ = InElementwiseOperation{divider}; - acc_element_op_ = AccElementwiseOperation{divider}; - } + int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1]; + + std::tie(in_element_op_, acc_element_op_) = + reduce_unary_operator::GetElementwiseOperator(reduceLength); } const InDataType* p_in_dev_; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp index 575c6bff1d..6401455bd5 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp @@ -61,12 +61,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce::value || std::is_same::value; - - static_assert( - !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op), - "The OutDataType must support the atomic operation for using MultiBlock reduction"); + static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType::value, + "The OutDataType must support the specified OutMemoryDataOperation!"); static_assert(!use_multiblock || (use_multiblock && !OutputIndex), "MultiBlock reduction can only be used when outputting index is not required"); @@ -349,7 +346,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce( + ck::reduce::GetIdentityValueForInMemoryDataOperation( OutMemoryDataOperation); const auto kernel_pre = @@ -492,7 +489,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce"; diff --git a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp index 634e9212ea..4b3f52148d 100644 --- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp +++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp @@ -29,6 +29,7 @@ #include "reduction_operator.hpp" #include "reduction_enums.hpp" #include "element_wise_operation.hpp" +#include namespace ck { @@ -37,77 +38,69 @@ namespace ck { // The boolean member "indexable" is also provided in reduce_binary_operator for // easier checking by the upper-layer code in the kernels.
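// A hedged usage sketch (not part of the patch): the template-argument lists in this listing
// were stripped during extraction, so the one remaining parameter of reduce_binary_operator is
// assumed to be the ReduceTensorOp enum value.
//
//   using ReduceOperation = typename ck::reduce_binary_operator<ck::ReduceTensorOp::ADD>::opType;
//   static_assert(!ck::reduce_binary_operator<ck::ReduceTensorOp::ADD>::indexable,
//                 "ADD does not track an accumulated index");
//
// With the dataType member gone, the accumulation type is chosen at each call site, e.g.
// ReduceOperation::GetIdentityValue<AccDataType>(), instead of being baked into the mapping.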
-template +template struct reduce_binary_operator; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Add; - using dataType = T; + using opType = reduce::Add; static constexpr bool indexable = false; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Mul; - using dataType = T; + using opType = reduce::Mul; static constexpr bool indexable = false; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Min; - using dataType = T; + using opType = reduce::Min; static constexpr bool indexable = true; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Max; - using dataType = T; + using opType = reduce::Max; static constexpr bool indexable = true; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::AMax; - using dataType = T; + using opType = reduce::AMax; static constexpr bool indexable = true; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Add; - using dataType = T; + using opType = reduce::Add; static constexpr bool indexable = false; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Add; - using dataType = T; + using opType = reduce::Add; static constexpr bool indexable = false; }; -template -struct reduce_binary_operator +template <> +struct reduce_binary_operator { - using opType = reduce::Add; - using dataType = T; + using opType = reduce::Add; static constexpr bool indexable = false; }; @@ -115,53 +108,101 @@ struct reduce_binary_operator // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary // functor classes. 
// The two unary functors are called before and after the reduction is executed, respectively -template +template struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; - using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; -template -struct reduce_unary_operator +template +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; - using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength}); + }; }; -template -struct reduce_unary_operator +template +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; - using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; + using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; -template -struct reduce_unary_operator +template +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; - using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; + using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; -template -struct reduce_unary_operator +template <> +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; - using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; + using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; -template -struct reduce_unary_operator +template <> +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; - using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; + using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; -template -struct
reduce_unary_operator +template <> +struct reduce_unary_operator { - using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; - using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; }; } // end of namespace ck diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index 1032f0f8fc..bc1b11d468 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -28,100 +28,189 @@ namespace ck { namespace tensor_operation { -namespace binary_element_wise { -template -struct Add; +namespace element_wise { -template <> -struct Add +struct Add { + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> __host__ __device__ constexpr void - operator()(double& dst, const double& src1, const double& src2) const + operator()(float& y, const float& x0, const float& x1) const { - dst = src1 + src2; + y = x0 + x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + y = x0 + x1; + }; + + // Question: should half_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = x0 + x1; + }; + + // Question: should bhalf_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const + { + const float x1_tmp = ck::type_convert(x0); + const float x2_tmp = ck::type_convert(x1); + const float y_tmp = x1_tmp + x2_tmp; + y = ck::type_convert(y_tmp); } }; -template <> -struct Add +struct Subtract { + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> __host__ __device__ constexpr void - operator()(float& dst, const float& src1, const float& src2) const + operator()(float& y, const float& x0, const float& x1) const { - dst = src1 + src2; + y = x0 - x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + y = x0 - x1; + }; + + // Question: should half_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = x0 - x1; + }; + + // Question: should bhalf_t be supported ? 
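// A hedged illustration of the struct-scope operator() template pattern used in this file
// (assuming the stripped specializations cover float, double, half_t and bhalf_t, as their
// bodies suggest): one functor instance dispatches purely on the argument types.
//
//   ck::tensor_operation::element_wise::Add add{};
//   float yf;
//   add(yf, 1.0f, 2.0f);                       // float specialization, yf == 3.0f
//   ck::bhalf_t yb;                            // bhalf_t arithmetic is routed through float
//   add(yb, ck::type_convert<ck::bhalf_t>(1.0f), ck::type_convert<ck::bhalf_t>(2.0f));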
+ template <> + __host__ __device__ constexpr void + operator()(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const + { + const float x1_tmp = ck::type_convert(x0); + const float x2_tmp = ck::type_convert(x1); + const float y_tmp = x1_tmp - x2_tmp; + y = ck::type_convert(y_tmp); } }; -template <> -struct Add +struct AlphaBetaAdd { + AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> __host__ __device__ constexpr void - operator()(half_t& dst, const half_t& src1, const half_t& src2) const + operator()(float& y, const float& x0, const float& x1) const { - dst = src1 + src2; - } + y = alpha_ * x0 + beta_ * x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + y = static_cast(alpha_) * x0 + static_cast(beta_) * x1; + }; + + // Question: should half_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = static_cast(alpha_ * static_cast(x0) + beta_ * static_cast(x1)); + }; + + float alpha_; + float beta_; }; -template <> -struct Add +struct AddRelu { + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> __host__ __device__ constexpr void - operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const + operator()(float& y, const float& x0, const float& x1) const { - const float x1 = ck::type_convert(src1); - const float x2 = ck::type_convert(src2); - const float y = x1 + x2; - dst = ck::type_convert(y); - } + const float a = x0 + x1; + y = a > 0.0f ? a : 0.0f; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + const double a = x0 + x1; + y = a > 0.0 ? a : 0.0; + }; + + // Question: should half_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + const half_t a = x0 + x1; + y = a > static_cast(0.0f) ? a : static_cast(0.0f); + }; }; -template -struct Substract; - -template <> -struct Substract +struct AddHardswish { + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> __host__ __device__ constexpr void - operator()(double& dst, const double& src1, const double& src2) const + operator()(float& y, const float& x0, const float& x1) const { - dst = src1 - src2; - } + float a = x0 + x1; + float b = a + float{3}; + float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f; + y = c; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + double a = x0 + x1; + double b = a + 3.0; + double c = (b > 0) * (b > 6.0 ? 6.0 : b) * a * 0.166667; + y = c; + }; + + // Question: should half_t be supported ? + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + float a = x0 + x1; + float b = a + 3.0f; + float c = (b > 0) * (b > 6.0f ? 
6.0f : b) * a * 0.166667f; + y = c; + }; }; -template <> -struct Substract -{ - __host__ __device__ constexpr void - operator()(float& dst, const float& src1, const float& src2) const - { - dst = src1 - src2; - } -}; +} // namespace element_wise -template <> -struct Substract -{ - __host__ __device__ constexpr void - operator()(half_t& dst, const half_t& src1, const half_t& src2) const - { - dst = src1 - src2; - } -}; - -template <> -struct Substract -{ - __host__ __device__ constexpr void - operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const - { - const float x1 = ck::type_convert(src1); - const float x2 = ck::type_convert(src2); - const float y = x1 - x2; - dst = ck::type_convert(y); - } -}; - -} // namespace binary_element_wise } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 596213e9e1..e4a2c7ac19 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -1,97 +1,13 @@ #pragma once #include "data_type.hpp" #include "math_v2.hpp" +#include "unary_element_wise_operation.hpp" +#include "binary_element_wise_operation.hpp" namespace ck { namespace tensor_operation { namespace element_wise { -struct PassThrough -{ - __host__ __device__ void operator()(float& y, const float& x) const { y = x; } - - __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; } - - __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { y = x; } - - __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; } - - __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; } - - __host__ __device__ void operator()(double& y, const double& x) const { y = x; } -}; - -struct Add -{ - __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const - { - y = x0 + x1; - } - - __host__ __device__ constexpr void - operator()(half_t& y, const half_t& x0, const half_t& x1) const - { - // FIXME - Use float (acc type) bias in the future. - y = x0 + x1; - } -}; - -struct AlphaBetaAdd -{ - AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {} - - __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const - { - y = alpha_ * x0 + beta_ * x1; - } - - __host__ __device__ constexpr void - operator()(half_t& y, const half_t& x0, const half_t& x1) const - { - // FIXME - Let x0 be acc type - y = static_cast(alpha_ * static_cast(x0) + beta_ * static_cast(x1)); - } - - float alpha_; - float beta_; -}; - -struct AddRelu -{ - __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const - { - const float a = x0 + x1; - y = a > 0 ? a : 0; - } - - __host__ __device__ constexpr void - operator()(half_t& y, const half_t& x0, const half_t& x1) const - { - const half_t a = x0 + x1; - y = a > 0 ? a : 0; - } -}; - -struct AddHardswish -{ - __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const - { - float a = x0 + x1; - float b = a + float{3}; - float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667}; - y = c; - } - - __host__ __device__ constexpr void - operator()(half_t& y, const half_t& x0, const half_t& x1) const - { - float a = x0 + x1; - float b = a + float{3}; - float c = (b > 0) * (b > float{6} ? 
float{6} : b) * a * float{0.166667}; - y = c; - } -}; - struct AddReluAdd { __host__ __device__ constexpr void @@ -167,204 +83,41 @@ struct Relu struct Normalize { - Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {} + Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {} - __host__ __device__ constexpr void operator()(float& y, - const float& x, - const float& mean, - const float& mean_square, - const float& gamma, - const float& beta) const + template + __host__ __device__ constexpr void operator()( + T& y, const T& x, const T& mean, const T& mean_square, const T& gamma, const T& beta) const; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x, + const float& mean, + const float& mean_square, + const float& gamma, + const float& beta) const { + using ck::math::sqrt; + float variance = mean_square - (mean * mean); - y = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta; - } - - float epsilon_; -}; - -// Unary operators are usually called element-wisely before/after the reduction is executed on the -// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2 - -template -struct UnaryIdentic; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const { y = x; }; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const - { - y = x / type_convert(divider_); + y = ((x - mean) / sqrt(variance + static_cast(epsilon_))) * gamma + beta; }; - int32_t divider_ = 1; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const { y = x; }; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const + template <> + __host__ __device__ constexpr void operator()(double& y, + const double& x, + const double& mean, + const double& mean_square, + const double& gamma, + const double& beta) const { - y = x / type_convert(divider_); + using ck::math::sqrt; + + double variance = mean_square - (mean * mean); + y = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta; }; - int32_t divider_ = 1; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; }; - - __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; }; - - int32_t divider_ = 1; -}; - -template <> -struct UnaryIdentic -{ - __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }; -}; - -template -struct UnarySquare; - -template <> -struct UnarySquare -{ - __host__ __device__ UnarySquare(const 
int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; }; -}; - -template <> -struct UnarySquare -{ - __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const - { - y = x * x / type_convert(divider_); - }; - - int32_t divider_ = 1; -}; - -template <> -struct UnarySquare -{ - __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; }; -}; - -template <> -struct UnarySquare -{ - __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const - { - y = x * x / type_convert(divider_); - }; - - int32_t divider_ = 1; -}; - -template -struct UnaryAbs; - -template <> -struct UnaryAbs -{ - __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::abs(x); }; -}; - -template <> -struct UnaryAbs -{ - __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = ck::math::abs(x); }; -}; - -template <> -struct UnaryAbs -{ - __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::abs(x); }; -}; - -template <> -struct UnaryAbs -{ - __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = ck::math::abs(x); }; -}; - -template -struct UnarySqrt; - -template <> -struct UnarySqrt -{ - __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::sqrt(x); }; -}; - -template <> -struct UnarySqrt -{ - __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; }; - - __host__ __device__ void operator()(double& y, const double& x) const - { - y = ck::math::sqrt(x); - }; + double epsilon_; }; template diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp new file mode 100644 index 0000000000..90c39e5c9a --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -0,0 +1,80 @@ +#pragma once +#include "data_type.hpp" +#include "math_v2.hpp" + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct PassThrough +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "Data type is not supported by this operation!"); + + y = x; + }; +}; + +struct UnaryDivide +{ + __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider){}; + + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +struct UnarySquare +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + 
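// Compile-time guard: reject element types this operation was not written for.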
static_assert(is_same::value || is_same::value, + "Data type is not supported by this operation!"); + + y = x * x; + }; +}; + +struct UnaryAbs +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = ck::math::abs(x); + }; +}; + +struct UnarySqrt +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value, + "Data type is not supported by this operation!"); + + y = ck::math::sqrt(x); + }; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp index b2f06c03c6..4206a91406 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp @@ -171,15 +171,15 @@ struct GridwiseReduction_mk_to_m_multiblock AccDataType beta, OutDataType* const __restrict__ p_out_value_global) { - const auto identityVal = ReduceOperation::GetIdentityValue(); + const auto identityVal = ReduceOperation::template GetIdentityValue(); // LDS __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - const auto in_global_val_buf = - make_dynamic_buffer(p_in_value_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(identityVal)); + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); auto out_global_val_buf = make_dynamic_buffer( p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); @@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; - const auto identityVal = ReduceOperation::GetIdentityValue(); + const auto identityVal = ReduceOperation::template GetIdentityValue(); - const auto in_global_val_buf = - make_dynamic_buffer(p_in_value_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(identityVal)); + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); const auto in_global_idx_buf = make_dynamic_buffer( p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); auto out_global_val_buf = make_dynamic_buffer( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index 074aafb9d4..d6e4bbd4cb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -135,12 +135,12 @@ struct GridwiseReduction_mk_to_m_threadwise ReduceOperation, PropagateNan>; - const auto identityVal = ReduceOperation::GetIdentityValue(); + const auto identityVal = ReduceOperation::template GetIdentityValue(); - const auto in_global_val_buf = - make_dynamic_buffer(p_in_value_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(identityVal)); + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template 
GetIdentityValue()); auto dst_global_buf = make_dynamic_buffer( p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); @@ -276,12 +276,12 @@ struct GridwiseReduction_mk_to_m_threadwise (void)acc_elementwise_op; - const auto identityVal = ReduceOperation::GetIdentityValue(); + const auto identityVal = ReduceOperation::template GetIdentityValue(); - const auto in_global_val_buf = - make_dynamic_buffer(p_in_value_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(identityVal)); + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); const auto in_global_idx_buf = make_dynamic_buffer( p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp index 5a3980541d..0b790d4e38 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp @@ -927,7 +927,8 @@ struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 false>; // Global write Gemm shuffle + reduction - const auto d_zeroVal = DReduceOperation::GetIdentityValue(); + const auto d_zeroVal = + DReduceOperation::template GetIdentityValue(); static_for<0, mreduce_per_thread, 1>{}( [&](auto I) { d_thread_buf(I) = d_zeroVal; }); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp index 0b09cd40e1..80a6eeace6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp @@ -816,7 +816,8 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 false>; // Global write Gemm shuffle + reduction - const auto d_identityVal = DReduceOperation::GetIdentityValue(); + const auto d_identityVal = + DReduceOperation::template GetIdentityValue(); static_for<0, mreduce_per_thread, 1>{}( [&](auto I) { d_thread_buf(I) = d_identityVal; }); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp index 6d95aec938..dcb45b6d5f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp @@ -37,7 +37,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe { - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; constexpr auto I0 = Number<0>{}; diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp index ee40398d25..eccdf932d7 100644 --- a/include/ck/utility/reduction_operator.hpp +++ b/include/ck/utility/reduction_operator.hpp @@ -28,6 +28,7 @@ #include "config.hpp" #include "data_type.hpp" +#include "type.hpp" namespace ck { @@ -54,64 +55,92 @@ namespace reduce { // accumulated index also need be // changed. 
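// A hedged sketch of the new accumulator-side call pattern (AccDataType stands for whatever
// accumulation type the caller picks; it is no longer a class template parameter):
//
//   using Sum = ck::reduce::Add;
//   float acc = Sum::GetIdentityValue<float>();   // identity for addition: 0.0f
//   Sum{}(acc, 3.0f);                             // acc == 3.0f
//   static_assert(Sum::IsCompatibleInMemoryDataOperation(ck::InMemoryDataOperationEnum::AtomicAdd),
//                 "reduce::Add allows AtomicAdd (and Set) as the final memory operation");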
-template <class T>
 struct Add
 {
-    using dataType = T;
+    template <typename T>
+    __host__ __device__ static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    };

-    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
-
-    __device__ static constexpr bool
+    __host__ __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
     {
         return operation == InMemoryDataOperationEnum::AtomicAdd ||
                operation == InMemoryDataOperationEnum::Set;
     };

-    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
+    template <typename T>
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, int32_t>::value,
+                      "The data type is not supported by the Add accumulator!");
+
+        a = a + b;
+    }
 };

-template <class T>
 struct Mul
 {
-    using dataType = T;
+    template <typename T>
+    __host__ __device__ static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(1.0f);
+    };

-    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };
-
-    __device__ static constexpr bool
+    __host__ __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
     {
         return operation == InMemoryDataOperationEnum::Set;
     };

-    __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
+    template <typename T>
+    __host__ __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, int32_t>::value,
+                      "The data type is not supported by the Mul accumulator!");
+
+        a = a * b;
+    }
 };

-template <class T>
 struct Max
 {
-    using dataType = T;
-
+    template <typename T>
     __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };

-    __device__ static constexpr bool
+    __host__ __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
     {
         // ToChange: atomic_max to be added
         return operation == InMemoryDataOperationEnum::Set;
     };

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the Max accumulator!");
+
         if(a < b)
             a = b;
     }

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the Max accumulator!");
+
         if(a < b)
         {
             a = b;
@@ -120,28 +149,41 @@ struct Max
     }
 };

-template <class T>
 struct Min
 {
-    using dataType = T;
+    template <typename T>
+    __host__ __device__ static constexpr T GetIdentityValue()
+    {
+        return NumericLimits<T>::Max();
+    };

-    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };
-
-    __device__ static constexpr bool
+    __host__ __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
     {
         // ToChange: atomic_min to be added
         return operation == InMemoryDataOperationEnum::Set;
     };

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the Min accumulator!");
+
         if(a > b)
             a = b;
     }

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the Min accumulator!");
+
         if(a > b)
         {
             a = b;
@@ -150,28 +192,41 @@ struct Min
     }
 };

-template <class T>
 struct AMax
 {
-    using dataType = T;
+    template <typename T>
+    __host__ __device__ static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    };

-    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
-
-    __device__ static constexpr bool
+    __host__ __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
     {
         // ToChange: atomic_max to be added
         return operation == InMemoryDataOperationEnum::Set;
     };

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the AMax accumulator!");
+
         if(a < b)
             a = b;
     }

+    template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "The data type is not supported by the AMax accumulator!");
+
         if(a < b)
         {
             a = b;
@@ -181,7 +236,7 @@ struct AMax
 };

 template <typename T>
-T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+constexpr T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
     T result = ck::type_convert<T>(0.0f);

@@ -191,6 +246,44 @@ T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperation
     return (result);
 };

+template <InMemoryDataOperationEnum Operation, typename DataType>
+struct InMemoryDataOperatonSupportedOnDataType
+{
+    static constexpr bool value = false;
+};
+
+template <typename DataType>
+struct InMemoryDataOperatonSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd, DataType>
+{
+    static constexpr bool value =
+        is_same<DataType, float>::value || is_same<DataType, double>::value;
+};
+
+template <typename DataType>
+struct InMemoryDataOperatonSupportedOnDataType<InMemoryDataOperationEnum::AtomicMax, DataType>
+{
+    static constexpr bool value =
+        is_same<DataType, float>::value || is_same<DataType, double>::value;
+};
+
+template <typename DataType>
+struct InMemoryDataOperatonSupportedOnDataType<InMemoryDataOperationEnum::Set, DataType>
+{
+    static constexpr bool value =
+        is_same<DataType, float>::value || is_same<DataType, double>::value ||
+        is_same<DataType, half_t>::value || is_same<DataType, bhalf_t>::value ||
+        is_same<DataType, int8_t>::value || is_same<DataType, int32_t>::value;
+};
+
+template <typename DataType>
+struct InMemoryDataOperatonSupportedOnDataType<InMemoryDataOperationEnum::Add, DataType>
+{
+    static constexpr bool value =
+        is_same<DataType, float>::value || is_same<DataType, double>::value ||
+        is_same<DataType, half_t>::value || is_same<DataType, int8_t>::value ||
+        is_same<DataType, int32_t>::value;
+};
+
 }; // end of namespace reduce

 } // end of namespace ck
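[Editor's note] The InMemoryDataOperatonSupportedOnDataType trait added above lets the
device layers reject an (in-memory operation, data type) pairing at compile time, e.g.
AtomicAdd only for float/double. A minimal self-contained sketch of the same pattern
(simplified, hypothetical names; not the ck:: implementation):

#include <type_traits>

enum class MemOpSketch { Set, AtomicAdd };

template <MemOpSketch Op, typename T>
struct MemOpSupported : std::false_type {};

// AtomicAdd is only valid for types the hardware can atomically add.
template <typename T>
struct MemOpSupported<MemOpSketch::AtomicAdd, T>
    : std::integral_constant<bool,
                             std::is_same<T, float>::value || std::is_same<T, double>::value>
{};

// Plain stores are allowed for every type in this sketch.
template <typename T>
struct MemOpSupported<MemOpSketch::Set, T> : std::true_type {};

static_assert(MemOpSupported<MemOpSketch::AtomicAdd, float>::value, "AtomicAdd on float: ok");
static_assert(!MemOpSupported<MemOpSketch::AtomicAdd, int>::value, "AtomicAdd on int: rejected");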
diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp
index 0e94095639..6c7162f067 100644
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -174,15 +174,18 @@ struct ReductionHost
              const InDataType* in_data,
              float beta,
              OutDataType* out_data,
-             IndexDataType* out_indices)
+             IndexDataType* out_indices,
+             InElementwiseOperation in_elementwise_op,
+             AccElementwiseOperation acc_elementwise_op)
     {
         if constexpr(OutputIndex)
         {
-            RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
+            RunImpl_with_index(
+                alpha, in_data, beta, out_data, out_indices, in_elementwise_op, acc_elementwise_op);
         }
         else
         {
-            RunImpl_no_index(alpha, in_data, beta, out_data);
+            RunImpl_no_index(alpha, in_data, beta, out_data, in_elementwise_op, acc_elementwise_op);
         };
     };

@@ -190,7 +193,9 @@ struct ReductionHost
                             const InDataType* in_data,
                             float beta,
                             OutDataType* out_data,
-                            IndexDataType* out_indices)
+                            IndexDataType* out_indices,
+                            InElementwiseOperation in_elementwise_op,
+                            AccElementwiseOperation acc_elementwise_op)
     {
         using ck::float_equal_one;
         using ck::float_equal_zero;

@@ -200,12 +205,10 @@ struct ReductionHost
                                                        ReduceOperation,
                                                        AccDataType,
                                                        IndexDataType>;

-        InElementwiseOperation in_elementwise_op(divider);
-        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOperation::GetIdentityValue();
+            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
             IndexDataType accuIndex = 0;

             for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -236,7 +239,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOperation::GetIdentityValue();
+                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
                 IndexDataType accuIndex = 0;

                 auto offset_invariant =
@@ -297,7 +300,12 @@ struct ReductionHost
         };
     };

-    void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
+    void RunImpl_no_index(float alpha,
+                          const InDataType* in_data,
+                          float beta,
+                          OutDataType* out_data,
+                          InElementwiseOperation in_elementwise_op,
+                          AccElementwiseOperation acc_elementwise_op)
     {
         using ck::float_equal_one;
         using ck::float_equal_zero;

@@ -306,12 +314,9 @@ struct ReductionHost
         using Accumulation =
             ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

-        InElementwiseOperation in_elementwise_op(divider);
-        AccElementwiseOperation acc_elementwise_op(divider);
-
         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOperation::GetIdentityValue();
+            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

             for(const auto& reduce_index : reduce_dim_indexes)
             {
@@ -338,7 +343,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOperation::GetIdentityValue();
+                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

                 auto offset_invariant =
                     get_offset_from_index(invariantStrides, invariant_index);
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 45fc8b8503..11252e2398 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -106,9 +106,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
                     }
                 }

-                float v_in;
-                arg.in_element_op_(v_in, v_acc);
-                arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in);
+                arg.in_element_op_(v_acc, v_acc);
+                arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_acc);
             };

             make_ParallelTensorFunctor(f_ncw,
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
index 3e7f220e03..5003965b0e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
@@ -66,8 +66,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator

                 for(int k = 0; k < K; ++k)
                 {
-                    arg.a_element_op_(a, arg.a_m_k_(m, k));
-                    arg.b_element_op_(b, arg.b_k_n_(k, n));
+                    arg.a_element_op_(a, static_cast<float>(arg.a_m_k_(m, k)));
+                    arg.b_element_op_(b, static_cast<float>(arg.b_k_n_(k, n)));

                     acc += a * b;
                 }
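[Editor's note] With the ReductionHost changes above, Run() now receives the element-wise
functors from the caller instead of rebuilding them from a stored divider, so the host
reference is guaranteed to use the exact operator instances handed to the device kernel.
A reduced sketch of that calling convention (hypothetical names, not the ck:: API):

#include <cstdint>
#include <vector>

struct DivideSketch
{
    int32_t divider_ = 1;
    void operator()(float& y, const float& x) const { y = x / static_cast<float>(divider_); }
};

inline float host_mean(const std::vector<float>& v, DivideSketch acc_elementwise_op)
{
    float acc = 0.0f; // Add identity value
    for(float x : v)
        acc += x;
    float out = 0.0f;
    acc_elementwise_op(out, acc); // caller-supplied op, identical to the device-side one
    return out;
}

// Usage: host_mean(data, DivideSketch{static_cast<int32_t>(data.size())});

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp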
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index e31d4e769e..0f8c365007 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -61,10 +61,10 @@ using reduce_configuration_2_instances_blockwise = std::tuple< >; #endif -template +template using deviceReduceBlockWisePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; template void add_device_reduce_instance_blockwise( - std::vector>& device_op_instances) + std::vector>& device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; + using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || @@ -137,7 +136,7 @@ void add_device_reduce_instance_blockwise( ReduceOpId, \ PropagateNan, \ UseIndex>( \ - std::vector> & device_op_instances) + std::vector> & device_op_instances) #define ADD_BLOCKWISE_INST_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ @@ -150,21 +149,17 @@ void add_device_reduce_instance_blockwise( Rank, \ NumReduceDim) -#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_blockwise( \ - std::vector::InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) +#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ + extern template void add_device_reduce_instance_blockwise( \ + std::vector> & device_op_instances) #define ADD_BLOCKWISE_INST_REF_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index 605109d077..9f78933bde 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -61,12 +61,10 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< >; #endif -template -using deviceReduceMultiBlockAtomicAddPtrType = - DeviceReducePtr:: - InElementwiseOperation, - typename reduce_unary_operator:: - AccElementwiseOperation>; +template +using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr< + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; template void add_device_reduce_instance_multiblock_atomic_add( - std::vector>& - device_op_instances) + std::vector>& 
device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; + using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || @@ -158,8 +154,7 @@ void add_device_reduce_instance_multiblock_atomic_add( ReduceOpId, \ PropagateNan, \ UseIndex>( \ - std::vector> & \ - device_op_instances) + std::vector> & device_op_instances) #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ @@ -172,21 +167,17 @@ void add_device_reduce_instance_multiblock_atomic_add( Rank, \ NumReduceDim) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_multiblock_atomic_add( \ - std::vector::InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ + extern template void add_device_reduce_instance_multiblock_atomic_add( \ + std::vector> & device_op_instances) #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index a2b4ae22be..563dd09b10 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -47,10 +47,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple< >; #endif -template +template using deviceReduceThreadWisePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; template void add_device_reduce_instance_threadwise( - std::vector>& device_op_instances) + std::vector>& device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; + using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; + typename reduce_unary_operator::AccElementwiseOperation; constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || @@ -114,7 +113,7 @@ void add_device_reduce_instance_threadwise( ReduceOpId, \ PropagateNan, \ UseIndex>( \ - std::vector> & device_op_instances) + std::vector> & device_op_instances) #define ADD_THREADWISE_INST_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ @@ -127,21 +126,17 @@ void 
add_device_reduce_instance_threadwise( Rank, \ NumReduceDim) -#define ADD_THREADWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_threadwise( \ - std::vector::InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) +#define ADD_THREADWISE_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ + extern template void add_device_reduce_instance_threadwise( \ + std::vector> & device_op_instances) #define ADD_THREADWISE_INST_REF_BY_ID( \ inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp index 466431b5be..886863c73b 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp @@ -21,11 +21,11 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp index 57339526dd..b5ddc43838 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp @@ -21,11 +21,11 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp index ac08f6b225..8426ab79c9 100644 --- 
a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp @@ -21,11 +21,11 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp index 3dce82c228..7cd1908803 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp @@ -21,11 +21,11 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp index da4ff0c214..2e1a7f531c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp index 45100ab905..db6140ea61 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp index 5a39acc5a7..050473886f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp index a6b378ca00..c50e6cf83d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = 
ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp index fe96268811..e1d2f2f6ff 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp index 4121bbb394..81509a3fc5 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp index cb23620d50..4d13381d45 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; 
+using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp index 6c772b5198..459d0cd473 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -21,12 +21,12 @@ template using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ReduceSum = ck::reduce::Add; +using ReduceSum = ck::reduce::Add; using ReduceOps = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index 010e9a45cc..d1737f588a 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -20,8 +20,8 @@ namespace device_gemm_instance { using F32 = float; using F16 = ck::half_t; using DPtrsGlobal = ck::Tuple; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; @@ -128,17 +128,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification, b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); } - using AElementOp = ck::tensor_operation::element_wise::PassThrough; - using BElementOp = ck::tensor_operation::element_wise::PassThrough; - using CElementOp = ck::tensor_operation::element_wise::PassThrough; - using D0ReduceOp = ck::reduce::Add; - using D1ReduceOp = ck::reduce::Add; - using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; - using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; - using DxsInElementOps = ck::Tuple; - using DxsOutElementOps = ck::Tuple; + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + using D0ReduceOp = ck::reduce::Add; + using D1ReduceOp = ck::reduce::Add; + using UnaryIdenticElementOp = 
ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + using DxsInElementOps = ck::Tuple; + using DxsOutElementOps = ck::Tuple; const auto a_element_op = AElementOp{}; const auto b_element_op = BElementOp{}; @@ -170,8 +168,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification, { for(int m = 0; m < M; ++m) { - float d0_acc = d0_reduce_op.GetIdentityValue(); - float d1_acc = d1_reduce_op.GetIdentityValue(); + float d0_acc = d0_reduce_op.GetIdentityValue(); + float d1_acc = d1_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) { diff --git a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp index c2837fefeb..5b792219c0 100644 --- a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp +++ b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp @@ -20,9 +20,9 @@ namespace device_gemm_instance { using F32 = float; using F16 = ck::half_t; using DPtrsGlobal = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; @@ -136,20 +136,18 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, c1_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); } - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using AElementOp = PassThrough; - using BElementOp = PassThrough; - using CElementOp = PassThrough; - using C1ElementOp = PassThrough; - using D0ReduceOp = ck::reduce::Add; - using D1ReduceOp = ck::reduce::Add; - using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryIdentic; - using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; - using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; - using DxsInElementOps = ck::Tuple; - using DxsOutElementOps = ck::Tuple; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = PassThrough; + using C1ElementOp = PassThrough; + using D0ReduceOp = ck::reduce::Add; + using D1ReduceOp = ck::reduce::Add; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + using DxsInElementOps = ck::Tuple; + using DxsOutElementOps = ck::Tuple; const auto a_element_op = AElementOp{}; const auto b_element_op = BElementOp{}; @@ -196,15 +194,15 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, for(int m = 0; m < M; ++m) { - ReduceAccDataType d0_acc = d0_reduce_op.GetIdentityValue(); - ReduceAccDataType d1_acc = d1_reduce_op.GetIdentityValue(); + auto d0_acc = d0_reduce_op.GetIdentityValue(); + auto d1_acc = d1_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) { ReduceAccDataType c_val = ck::type_convert(c_m_n_host_result(m, n)); - ReduceAccDataType d0_val = 0; - ReduceAccDataType d1_val = 0; + ReduceAccDataType d0_val; + ReduceAccDataType d1_val; dxs_in_element_op(ck::Number<0>{})(d0_val, c_val); 
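// Editor's note (standalone sketch, not part of the patch): in the verification
// loops above, each GEMM output value is fed through a per-reduction input op
// (PassThrough for d0, UnarySquare for d1), accumulated with ck::reduce::Add,
// and finished with UnaryDivide -- i.e. a row mean and mean-of-squares. A plain
// C++ equivalent of that host computation, with hypothetical names:
#include <vector>

inline void row_mean_and_mean_square(const std::vector<float>& row, float& mean, float& mean_sq)
{
    float d0_acc = 0.0f; // Add identity value
    float d1_acc = 0.0f;
    for(float c : row)
    {
        float d0_val = c;     // PassThrough-style input op
        float d1_val = c * c; // UnarySquare-style input op
        d0_acc += d0_val;     // Add accumulator
        d1_acc += d1_val;
    }
    const float n = static_cast<float>(row.size());
    mean    = d0_acc / n; // UnaryDivide-style output op
    mean_sq = d1_acc / n;
}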
dxs_in_element_op(ck::Number<1>{})(d1_val, c_val); diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index a70dc837ed..97c23defe0 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -20,9 +20,9 @@ namespace device_gemm_instance { using F32 = float; using F16 = ck::half_t; using DPtrsGlobal = ck::Tuple; -using Div = ck::tensor_operation::element_wise::UnaryIdentic; -using Identity = ck::tensor_operation::element_wise::UnaryIdentic; -using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; using DInElementOps = ck::Tuple; using DOutElementOps = ck::Tuple; @@ -123,18 +123,16 @@ bool profile_gemm_reduce_impl(int do_verification, b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); } - using AElementOp = ck::tensor_operation::element_wise::PassThrough; - using BElementOp = ck::tensor_operation::element_wise::PassThrough; - using CElementOp = ck::tensor_operation::element_wise::PassThrough; - using D0ReduceOp = ck::reduce::Add; - using D1ReduceOp = ck::reduce::Add; - using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryIdentic; - using UnaryIdenticElementOp = - ck::tensor_operation::element_wise::UnaryIdentic; - using UnarySquareElementOp = - ck::tensor_operation::element_wise::UnarySquare; - using DxsInElementOps = ck::Tuple; - using DxsOutElementOps = ck::Tuple; + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + using D0ReduceOp = ck::reduce::Add; + using D1ReduceOp = ck::reduce::Add; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + using DxsInElementOps = ck::Tuple; + using DxsOutElementOps = ck::Tuple; const auto a_element_op = AElementOp{}; const auto b_element_op = BElementOp{}; @@ -167,15 +165,15 @@ bool profile_gemm_reduce_impl(int do_verification, for(int m = 0; m < M; ++m) { - ReduceAccDataType d0_acc = d0_reduce_op.GetIdentityValue(); - ReduceAccDataType d1_acc = d1_reduce_op.GetIdentityValue(); + auto d0_acc = d0_reduce_op.GetIdentityValue(); + auto d1_acc = d1_reduce_op.GetIdentityValue(); for(int n = 0; n < N; ++n) { ReduceAccDataType c_val = ck::type_convert(c_m_n_host_result(m, n)); - ReduceAccDataType d0_val = 0; - ReduceAccDataType d1_val = 0; + ReduceAccDataType d0_val; + ReduceAccDataType d1_val; dxs_in_element_op(ck::Number<0>{})(d0_val, c_val); dxs_in_element_op(ck::Number<1>{})(d1_val, c_val); diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index fd519d1033..5e192aa1bc 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -261,13 +261,18 @@ bool profile_reduce_impl_impl(bool do_verification, float best_gb_per_sec = 0; using InElementwiseOperation = - typename reduce_unary_operator:: - InElementwiseOperation; + typename reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; + typename 
reduce_unary_operator::AccElementwiseOperation; - using ReduceOperation = typename reduce_binary_operator::opType; + using ReduceOperation = typename reduce_binary_operator::opType; + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); using DeviceReduceInstPtr0 = DeviceReducePtr; @@ -323,8 +328,13 @@ bool profile_reduce_impl_impl(bool do_verification, OutputIndex> hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run( - alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); }; std::vector i_inLengths; @@ -339,10 +349,6 @@ bool profile_reduce_impl_impl(bool do_verification, for(auto& reduce_ptr : reduce0_ptrs) { - - InElementwiseOperation in_elementwise_op(static_cast(reduce_total_length)); - AccElementwiseOperation acc_elementwise_op(static_cast(reduce_total_length)); - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, i_inStrides, i_outLengths,