diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md
index fca8205ca6..20e1b5aa6a 100644
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -37,7 +37,7 @@ cmake \
 ```bash
 # -D : input 4-d tensor lengths
 # -v : verification (0=no, 1=yes)
-#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg1: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
 #arg2: run kernel # of times (>1)
 ./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
 ```
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 6a5864ede0..e41a961103 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -13,7 +13,7 @@
 #include "device_base.hpp"
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
-#include "host_generic_reduction.hpp"
+#include "host_reduction.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"
 
@@ -21,13 +21,13 @@
 using namespace ck;
 using namespace ck::tensor_operation::device;
 
-using InDataType  = half_float::half;
-using OutDataType = half_float::half;
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
 using AccDataType = float;
 
-using kInDataType  = ck::half_t;
-using kOutDataType = ck::half_t;
-using kAccDataType = float;
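+// the host-side reference reduction below keeps using half_float::half, while the
+// device instance now works on ck::half_t directly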
+using HostInDataType  = half_float::half;
+using HostOutDataType = half_float::half;
+using HostAccDataType = float;
 
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;
@@ -43,9 +43,9 @@ using InElementwiseOperation =
 using AccElementwiseOperation =
     typename reduce_unary_operator::AccElementwiseOperation;
 
-using DeviceReduceInstance = DeviceReduceBlockWise{}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{}, num_thread);
-        break;
+    case 0: break;
     case 1:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
+        if(beta != 0.0f)
+            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}, num_thread);
+        break;
+    case 2:
         in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
         if(beta != 0.0f)
             out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
         if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{1, 5}, num_thread);
+            out_ref.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-5.0, 5.0}, num_thread);
     }
 
     if(beta != 0.0f)
@@ -293,17 +298,27 @@ int main(int argc, char* argv[])
     if(beta != 0.0f)
         out_dev.ToDevice(out.mData.data());
 
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
 
     DeviceMem out_indices_dev(indicesSizeInBytes);
 
     if(args.do_verification)
    {
-        ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
+        ReductionHost<HostInDataType, HostAccDataType, HostOutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
 
-        hostReduce.Run(
-            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
+        hostReduce.Run(alpha,
+                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
+                       beta,
+                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
+                       out_indices_ref.mData.data());
     };
 
     const auto i_inLengths  = to_int_vector(args.inLengths);
@@ -313,7 +328,7 @@ int main(int argc, char* argv[])
 
     auto reduce = DeviceReduceInstance{};
 
-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
+    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
 
     DeviceMem ws_dev(wsSizeInBytes);
diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md
index 1f8cc4cfbd..4b994e7382 100644
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -36,7 +36,7 @@ cmake \
 ## Run ```pool2d_fwd```
 ```bash
 #arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg2: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
 ./example/pool2d_fwd 1 1 10
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a0cb61136f..0b4aba3af1 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -236,8 +236,9 @@ int main(int argc, char* argv[])
     switch(init_method)
     {
     case 0: break;
-    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
-    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
+    case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
     }
 
     DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
index 11fd58a2ff..50fa64dab8 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
@@ -16,9 +16,11 @@ namespace device {
 template <typename InElementwiseOperation, typename AccElementwiseOperation>
 struct DeviceReduce : public BaseOperator
 {
-    virtual size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths)
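+    // the workspace layout depends on which dimensions get reduced, so the base
+    // interface now takes reduceDims alongside the (still unshuffled) input lengths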
+    virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
+                                                 const std::vector<int> reduceDims)
     {
         (void)inLengths;
+        (void)reduceDims;
         return (0);
     };
 
@@ -32,19 +34,19 @@ struct DeviceReduce : public BaseOperator
     };
 
     virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) = 0;
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
index cc1919ab81..4f17989b53 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
@@ -36,20 +36,20 @@ struct DeviceReduceBlockWise : public DeviceReduce
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -57,18 +57,18 @@ struct DeviceReduceBlockWise : public DeviceReduce
     static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                     const std::vector<int>& inStrides)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -79,6 +79,9 @@ struct DeviceReduceBlockWise : public DeviceReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -93,18 +96,20 @@ struct DeviceReduceBlockWise : public DeviceReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -112,44 +117,45 @@ struct DeviceReduceBlockWise : public DeviceReduce
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto inPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto inPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, inPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -160,21 +166,21 @@ struct DeviceReduceBlockWise : public DeviceReduce
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
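+            // after shuffle_tensor_dimensions the innermost reduce dimension sits at
+            // index Rank - 1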
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
@@ -186,7 +192,7 @@ struct DeviceReduceBlockWise : public DeviceReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -278,18 +284,22 @@ struct DeviceReduceBlockWise : public DeviceReduce
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -308,19 +318,19 @@ struct DeviceReduceBlockWise : public DeviceReduce
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                        const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
index 1647b3d84c..d3b1b4b5c3 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
@@ -37,6 +37,10 @@ struct DeviceReduceBlockWiseSecondCall
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
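+    // the second call always reads the partial-result workspace along the reduced (K)
+    // dimension, so only InSrcVectorDim == 1 is meaningful here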
+    static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     using IndexDataType = int32_t;
 
     static constexpr bool BetaIsZero = NeedIndices;
 
@@ -46,12 +50,8 @@ struct DeviceReduceBlockWiseSecondCall
         "InDataType and AccDataType should be the same to use DeviceReduceBlockWiseSecondCall!");
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
 
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -65,18 +65,20 @@ struct DeviceReduceBlockWiseSecondCall
         const auto in_grid_desc_m_k =
             make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -84,26 +86,27 @@ struct DeviceReduceBlockWiseSecondCall
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
@@ -131,8 +134,8 @@ struct DeviceReduceBlockWiseSecondCall
           in_elementwise_op_(in_elementwise_op),
           acc_elementwise_op_(acc_elementwise_op)
         {
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             invariant_total_length = inLengths[0];
             reduce_total_length    = inLengths[1];
 
@@ -159,7 +162,7 @@ struct DeviceReduceBlockWiseSecondCall
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -268,19 +271,19 @@ struct DeviceReduceBlockWiseSecondCall
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         (void)reduceDims;
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
index 85e0eb1197..038c754722 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
@@ -12,38 +12,30 @@
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// template <typename InElementwiseOperation, typename AccElementwiseOperation>
-// using DeviceReducePtr = std::unique_ptr<DeviceReduce<InElementwiseOperation, AccElementwiseOperation>>;
-
-template <index_t Rank, typename ReduceDims>
+// here, inLengths[] is already shuffled so that lengths of invariant dims are included before those
+// of reduce dims
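+// e.g. with Rank = 4, NumReduceDim = 2 and shuffled lengths {L0, L1, R0, R1}, the
+// returned pair is {L0 * L1, R0 * R1}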
+template <index_t Rank, index_t NumReduceDim>
 std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
 {
     static_assert(Rank <= 6, "bigger Rank size not supported!");
 
-    size_t tensor_total_length = 1;
-    size_t reduce_total_length = 1;
+    size_t invariant_total_length = 1;
+    size_t reduce_total_length    = 1;
 
-    static_for<0, ReduceDims::Size(), 1>{}(
-        [&](auto i) { reduce_total_length *= inLengths[ReduceDims::At(i)]; });
+    constexpr int NumInvariantDim = Rank - NumReduceDim;
 
-    static_for<0, Rank, 1>{}([&](auto i) { tensor_total_length *= inLengths[i.value]; });
+    for(int i = NumInvariantDim; i < Rank; i++)
+        reduce_total_length *= inLengths[i];
 
-    return std::make_pair(tensor_total_length / reduce_total_length, reduce_total_length);
-};
+    for(int i = 0; i < NumInvariantDim; i++)
+        invariant_total_length *= inLengths[i];
 
-template <typename Seq, index_t x>
-constexpr bool belong()
-{
-    bool inside = false;
-
-    static_for<0, Seq::Size(), 1>{}([&](auto i) { inside = (inside || (x == Seq::At(i))); });
-
-    return (inside);
+    return std::make_pair(invariant_total_length, reduce_total_length);
 };
 
 // helper functions using variadic template arguments
 template <index_t... Ns>
-static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
+auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
 {
     return make_tuple(static_cast<index_t>(lengths[Ns])...);
 };
 
@@ -59,16 +51,12 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
 
 template <index_t Rank, index_t NumReduceDim>
-static inline std::pair<std::vector<int>, std::vector<int>>
-shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
-                          const std::vector<int>& dimStrides,
-                          const std::vector<int>& reduceDims)
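+// e.g. Rank = 4, reduceDims = {0, 1, 2}: lengths {x0, x1, x2, x3} become {x3, x0, x1, x2}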
+std::vector<int> shuffle_tensor_dimensions(const std::vector<int>& origLengthsStrides,
+                                           const std::vector<int>& reduceDims)
 {
-    std::vector<int> newDimLengths;
-    std::vector<int> newDimStrides;
+    std::vector<int> newLengthsStrides;
 
-    assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
-           NumReduceDim == reduceDims.size());
+    assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());
 
     int reduceFlag = 0;
 
@@ -82,19 +70,17 @@ shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
 
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) == 0)
         {
-            newDimLengths.push_back(dimLengths[i]);
-            newDimStrides.push_back(dimStrides[i]);
+            newLengthsStrides.push_back(origLengthsStrides[i]);
         };
 
     // collect reduce dimensions
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) > 0)
         {
-            newDimLengths.push_back(dimLengths[i]);
-            newDimStrides.push_back(dimStrides[i]);
+            newLengthsStrides.push_back(origLengthsStrides[i]);
         };
 
-    return std::make_pair(newDimLengths, newDimStrides);
+    return newLengthsStrides;
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
index 5bf3c1d7d1..889c366875 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
@@ -39,18 +39,18 @@ struct DeviceReduceMultiBlockAtomicAdd
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     using IndexDataType = int32_t;
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr bool support_AtomicAdd =
         std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
 
@@ -67,18 +67,18 @@ struct DeviceReduceMultiBlockAtomicAdd
                                     int blkGroupSize,
                                     int kBlockTileIterations)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -89,6 +89,9 @@ struct DeviceReduceMultiBlockAtomicAdd
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -103,19 +106,20 @@ struct DeviceReduceMultiBlockAtomicAdd
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
         const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -123,44 +127,45 @@ struct DeviceReduceMultiBlockAtomicAdd
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -171,21 +176,21 @@ struct DeviceReduceMultiBlockAtomicAdd
             (void)out_indices_dev;
             (void)workspace_dev;
 
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             int iterations = 1;
             while(true)
@@ -218,7 +223,7 @@ struct DeviceReduceMultiBlockAtomicAdd
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -334,18 +339,22 @@ struct DeviceReduceMultiBlockAtomicAdd
 
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -371,19 +380,19 @@ struct DeviceReduceMultiBlockAtomicAdd
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
index 5b69afa5d8..d583f7f1b8 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
@@ -37,31 +37,35 @@ struct DeviceReduceMultiBlockPartialReduce
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
+    static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                      (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
+    static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
+
     using IndexDataType = int32_t;
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
 
-    size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
+    static constexpr int MaxBlockGroupSize = 256;
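+    // blkGroupSize is capped by searching for the smallest iteration count with
+    // ceil(reduce_total_length / (K_BlockTileSize * iterations)) <= MaxBlockGroupSize;
+    // e.g. reduce_total_length = 100000 with K_BlockTileSize = 256 settles at
+    // iterations = 2 and blkGroupSize = 196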
+
+    long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
+                                         const std::vector<int> reduceDims) override
     {
         size_t invariant_total_length;
         size_t reduce_total_length;
 
+        auto inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+
         std::tie(invariant_total_length, reduce_total_length) =
-            get_2d_lengths<Rank, ReduceDims>(inLengths);
+            get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
         int iterations = 1;
         while(true)
@@ -69,8 +73,7 @@ struct DeviceReduceMultiBlockPartialReduce
             int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                    (K_BlockTileSize * iterations);
 
-            // we want the blkGroupSize be not more than 128
-            if(testBlkGroupSize <= 128)
+            if(testBlkGroupSize <= MaxBlockGroupSize)
                 break;
 
             iterations++;
@@ -79,11 +82,12 @@ struct DeviceReduceMultiBlockPartialReduce
         int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                            (K_BlockTileSize * iterations);
 
-        size_t workspace_size = invariant_total_length * blkGroupSize;
+        long_index_t workspace_size = invariant_total_length * blkGroupSize;
 
-        size_t wsSizeInBytes =
-            !NeedIndices ? workspace_size * sizeof(AccDataType)
-                         : workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
+        long_index_t wsSizeInBytes =
+            !NeedIndices
+                ? workspace_size * sizeof(AccDataType)
+                : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int32_t);
 
         return (wsSizeInBytes);
     };
 
@@ -95,18 +99,18 @@ struct DeviceReduceMultiBlockPartialReduce
                                     int blkGroupSize,
                                     int kBlockTileIterations)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -117,6 +121,9 @@ struct DeviceReduceMultiBlockPartialReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -131,32 +138,35 @@ struct DeviceReduceMultiBlockPartialReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
         const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
-    static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
+    static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize)
     {
-        auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
+        auto ws_desc_m_k =
+            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));
 
-        const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto wsPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
         auto ws_desc_m_k_padded =
             transform_tensor_descriptor(ws_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, wsPad),
+                                        make_tuple(make_right_pad_transform(invariantLength, wsPad),
                                                    make_pass_through_transform(blkGroupSize)),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
@@ -166,19 +176,19 @@ struct DeviceReduceMultiBlockPartialReduce
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -188,21 +198,21 @@ struct DeviceReduceMultiBlockPartialReduce
           in_elementwise_op_{in_elementwise_op},
           acc_elementwise_op_{acc_elementwise_op}
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             int iterations = 1;
             while(true)
@@ -210,8 +220,7 @@ struct DeviceReduceMultiBlockPartialReduce
                 int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                        (K_BlockTileSize * iterations);
 
-                // we want the blkGroupSize be not more than 128
-                if(testBlkGroupSize <= 128)
+                if(testBlkGroupSize <= MaxBlockGroupSize)
                     break;
 
                 iterations++;
@@ -241,7 +250,7 @@ struct DeviceReduceMultiBlockPartialReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -337,18 +346,22 @@ struct DeviceReduceMultiBlockPartialReduce
 
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
        {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -371,19 +384,19 @@ struct DeviceReduceMultiBlockPartialReduce
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
index e975a10d71..bf4088a96b 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
@@ -36,20 +36,20 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -57,18 +57,18 @@ struct DeviceReduceThreadWise : public DeviceReduce
     static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                     const std::vector<int>& inStrides)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
            {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -79,6 +79,9 @@ struct DeviceReduceThreadWise : public DeviceReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -93,18 +96,20 @@ struct DeviceReduceThreadWise : public DeviceReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -112,44 +117,45 @@ struct DeviceReduceThreadWise : public DeviceReduce
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const OutElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const OutElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -161,21 +167,21 @@ struct DeviceReduceThreadWise : public DeviceReduce
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
            std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
@@ -187,7 +193,7 @@ struct DeviceReduceThreadWise : public DeviceReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -278,18 +284,22 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
         if constexpr(InSrcVectorDim == 0)
        {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -310,19 +320,19 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const OutElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const OutElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 2c45d1f544..fcc775e900 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -1,6 +1,5 @@
 #ifndef CK_ELEMENT_WISE_OPERATION_HPP
 #define CK_ELEMENT_WISE_OPERATION_HPP
-#include "data_type.hpp"
 #include "data_type.hpp"
 
@@ -19,6 +18,8 @@ struct PassThrough
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
 
     __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
+
+    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
 };
 
 struct Add
@@ -239,6 +240,24 @@ struct UnaryIdentic
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
 };
 
+template <>
+struct UnaryIdentic<int32_t, int32_t, true>
+{
+    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
+
+    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
+
+    int32_t divider_ = 1;
+};
+
+template <>
+struct UnaryIdentic<int8_t, int8_t>
+{
+    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
+
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
+};
+
 template <typename Y, typename X, bool HasDividing = false>
 struct UnarySquare;
 
@@ -311,6 +330,19 @@ struct UnaryAbs
     __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
 };
 
+template <>
+struct UnaryAbs<int8_t, int8_t>
+{
+    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
+
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
+    {
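+        // branchless abs: sgn is 0 for non-negative x and -1 (all ones bits) for
+        // negative x, so (x ^ sgn) - sgn yields x or -x without a conditional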
+        int8_t sgn = x >> (8 - 1);
+
+        y = (x ^ sgn) - sgn;
+    };
+};
+
 template <typename Y, typename X>
 struct UnarySqrt;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
index d68a217434..14fe0818a5 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
@@ -33,6 +33,7 @@
 #include "reduction_functions_blockwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "cluster_descriptor.hpp"
+#include "element_wise_operation.hpp"
 
 namespace ck {
 
@@ -52,23 +53,25 @@ __global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k,
                                         const OutElementwiseOperation acc_elementwise_op,
                                         AccDataType alpha,
                                         const InDataType* const __restrict__ p_in_global,
-                                        OutDataType beta,
+                                        AccDataType beta,
                                         OutDataType* const __restrict__ p_out_global,
                                         const IndexDataType* const __restrict__ p_ws_indices_global,
                                         IndexDataType* const __restrict__ p_indices_global)
 {
     if constexpr(!NeedIndices)
     {
-        GridwiseReduction::Run(in_grid_desc_m_k,
-                               out_grid_desc_m,
-                               in_elementwise_op,
-                               acc_elementwise_op,
-                               alpha,
-                               p_in_global,
-                               beta,
-                               p_out_global,
-                               p_ws_indices_global,
-                               p_indices_global);
+        constexpr bool IsSecondCall = false;
+
+        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k,
+                                                      out_grid_desc_m,
+                                                      in_elementwise_op,
+                                                      acc_elementwise_op,
+                                                      alpha,
+                                                      p_in_global,
+                                                      beta,
+                                                      p_out_global,
+                                                      p_ws_indices_global,
+                                                      p_indices_global);
     }
     else
     {
@@ -102,23 +105,25 @@ kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k,
                                     const OutElementwiseOperation acc_elementwise_op,
                                     AccDataType alpha,
                                     const InDataType* const __restrict__ p_in_global,
-                                    OutDataType beta,
+                                    AccDataType beta,
                                     OutDataType* const __restrict__ p_out_global,
                                     const IndexDataType* const __restrict__ p_ws_indices_global,
                                     IndexDataType* const __restrict__ p_indices_global)
 {
     if constexpr(!NeedIndices)
     {
-        GridwiseReduction::Run(in_grid_desc_m_k,
-                               out_grid_desc_m,
-                               in_elementwise_op,
-                               acc_elementwise_op,
-                               alpha,
-                               p_in_global,
-                               beta,
-                               p_out_global,
-                               p_ws_indices_global,
-                               p_indices_global);
+        constexpr bool IsSecondCall = true;
+
+        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k,
+                                                      out_grid_desc_m,
+                                                      in_elementwise_op,
+                                                      acc_elementwise_op,
+                                                      alpha,
+                                                      p_in_global,
+                                                      beta,
+                                                      p_out_global,
+                                                      p_ws_indices_global,
+                                                      p_indices_global);
     }
     else
     {
@@ -156,6 +161,11 @@
 template
 struct GridwiseReduction_mk_to_m_blockwise
 {
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
 
     using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
 
@@ -174,8 +184,7 @@ struct GridwiseReduction_mk_to_m_blockwise
     static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MThreadClusterSize * MThreadSliceSize>{}, Number<KThreadClusterSize * KThreadSliceSize>{}));
 
-    template <typename T>
-    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -183,17 +192,24 @@ struct GridwiseReduction_mk_to_m_blockwise
     static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
 
+    template <bool IsSecondCall>
     __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                                const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                                const OutElementwiseOperation& acc_elementwise_op,
                                AccDataType alpha,
                                const InDataType* const __restrict__ p_in_global,
-                               OutDataType beta,
+                               AccDataType beta,
                                OutDataType* const __restrict__ p_out_global,
                                const IndexDataType* const __restrict__ p_ws_indices_global,
                                IndexDataType* const __restrict__ p_indices_global)
     {
+        if constexpr(IsSecondCall)
+        {
+            static_assert(InSrcVectorDim == 1,
+                          "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!");
+        };
+
         using BlockwiseReduce = PartitionedBlockwiseReduction{}([&](auto I) {
-            accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
+            accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I]) * beta;
             });
         };
     };
@@ -355,7 +371,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                                OutDataType,
                                                decltype(reduced_data_desc),
                                                OutGridDesc_M,
-                                               PassThroughOp<OutDataType>,
+                                               PassThroughOp,
                                                Sequence<MThreadSliceSize>,
                                                Sequence<0>,
                                                0,
@@ -366,7 +382,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                 out_grid_desc_m,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                  thread_m_cluster_id * MThreadSliceSize),
-                PassThroughOp<OutDataType>{});
+                PassThroughOp{});
 
        threadwise_dst_store.Run(
out_grid_desc_m, out_global_buf); @@ -379,7 +395,7 @@ struct GridwiseReduction_mk_to_m_blockwise const OutElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, const IndexDataType* const __restrict__ p_ws_indices_global, IndexDataType* const __restrict__ p_indices_global) @@ -570,7 +586,7 @@ struct GridwiseReduction_mk_to_m_blockwise priorDstValueBuf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; }); }; }; @@ -580,7 +596,7 @@ struct GridwiseReduction_mk_to_m_blockwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -591,14 +607,14 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -609,7 +625,7 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), @@ -631,11 +647,14 @@ struct GridwiseReduction_mk_to_m_blockwise const OutElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_ws_values_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, const IndexDataType* const __restrict__ p_ws_indices_global, IndexDataType* const __restrict__ p_indices_global) { + static_assert(InSrcVectorDim == 1, + "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); + using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndex{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; }); }; }; @@ -851,7 +870,7 @@ struct GridwiseReduction_mk_to_m_blockwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -862,14 +881,14 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -880,7 +899,7 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp index 8527aee827..6a46135a33 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp @@ -32,6 +32,7 @@ #include "reduction_functions_blockwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" 
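// ---- editorial aside (annotation, not part of the patch) ----
// "element_wise_operation.hpp" is included wherever PassThroughOp is
// re-aliased from UnaryIdentic to tensor_operation::element_wise::PassThrough.
// UnaryIdentic carries a divider parameter (the int32_t specialization added
// earlier in this patch even divides by it, presumably for AVG), so the old
// alias needed a type argument; PassThrough is stateless, which is why every
// "PassThroughOp{}" construction site in this patch loses its template
// argument. A minimal generic sketch of such a functor follows; the name is
// hypothetical, and the in-tree PassThrough instead enumerates per-type
// operator() overloads, as the element_wise_operation.hpp hunk above shows.
struct PassThroughSketch
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        y = x; // identity: forward the value unchanged on store
    }
};
// The same header also gains a branchless UnaryAbs for int8_t: sgn = x >> 7
// is 0 for non-negative x and -1 (all ones) for negative x, so (x ^ sgn) - sgn
// either leaves x unchanged or flips all bits and adds one, i.e.
// two's-complement negation. Caveat: x = -128 has no positive int8_t
// counterpart and wraps back to -128.
// ---- end aside ----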
namespace ck { @@ -84,6 +85,11 @@ template struct GridwiseReduction_mk_to_m_multiblock_atomic_add { + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); using ThreadClusterLengths_M_K = Sequence; @@ -109,8 +115,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ReduceOperation, PropagateNan>; - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -249,7 +254,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add out_grid_desc_m, make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp index d47e4ed078..0c76794754 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp @@ -23,8 +23,8 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP #include "reduction_common.hpp" #include "reduction_operator.hpp" @@ -32,6 +32,7 @@ #include "reduction_functions_blockwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "cluster_descriptor.hpp" +#include "element_wise_operation.hpp" namespace ck { @@ -101,6 +102,12 @@ template struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce { + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); using ThreadClusterLengths_M_K = Sequence; @@ -119,8 +126,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -238,9 +244,6 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce reducedTiles++; } while(reducedTiles < num_k_block_tile_iteration); - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - // Each block executes multiple parallel reductions on the LDS, and due to the use of // vector_load, each block/thread is involved in multiple invariant dimensions.
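// ---- editorial aside (annotation, not part of the patch) ----
// The static_asserts added at the top of this struct spell out the tiling
// contract behind these loops: vector loads along InSrcVectorDim must evenly
// tile a thread's slice, i.e.
//   InSrcVectorDim == 0  ->  MThreadSliceSize % InSrcVectorSize == 0
//   InSrcVectorDim == 1  ->  KThreadSliceSize % InSrcVectorSize == 0
// For example, KThreadSliceSize = 8 with InSrcVectorSize = 4 means exactly two
// 4-wide loads per slice, while InSrcVectorSize = 3 is rejected at compile
// time. OutDstVectorSize must be 1 for MultiBlockPartialReduce because each
// block stores a (MThreadSliceSize x 1) column of partial results into the
// workspace at its own block_local_id, leaving nothing contiguous to
// vectorize on the store side. Moving reduced_data_desc below the reduction
// loop (next hunk) is a pure reordering: it is constexpr and is only needed
// by the workspace store.
// ---- end aside ----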
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { @@ -254,6 +257,9 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I)); }); + constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + if(thread_k_cluster_id == 0) { auto threadwise_workspace_store = @@ -261,7 +267,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -273,7 +279,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); threadwise_workspace_store.Run(reduced_data_desc, make_tuple(I0, I0), @@ -450,7 +456,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -462,14 +468,14 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_workspace_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -481,7 +487,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); threadwise_workspace_val_store.Run(reduced_data_desc, make_tuple(I0, I0), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index 3afa99c470..86caea2a92 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -31,6 +31,7 @@ #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" namespace ck { @@ -50,7 +51,7 @@ __global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { @@ -101,11 +102,15 @@ template struct GridwiseReduction_mk_to_m_threadwise { + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + using ThreadBufferDimAccessOrder = typename conditional, Sequence<0, 1>>::type; - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; @@ -115,7 +120,7 @@ struct GridwiseReduction_mk_to_m_threadwise const AccElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, 
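// ---- editorial aside (annotation, not part of the patch) ----
// beta is retyped from OutDataType to AccDataType across all of these kernels
// so the blend with the prior output happens in accumulator precision. The
// matching fix in the hunks below changes
//   accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta);   // old
// to
//   accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta;   // new
// (type_convert's target type, presumably AccDataType, was lost in
// extraction): convert the prior value up first, then multiply, so that with
// e.g. OutDataType = half_t and AccDataType = float the product is not
// rounded to half precision before it reaches the accumulator.
// ---- end aside ----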
IndexDataType* const __restrict__ p_indices_global) { @@ -228,7 +233,7 @@ struct GridwiseReduction_mk_to_m_threadwise priorDstValue_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; }); }; }; @@ -238,7 +243,7 @@ struct GridwiseReduction_mk_to_m_threadwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -248,7 +253,7 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); @@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_threadwise const AccElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { @@ -387,7 +392,7 @@ struct GridwiseReduction_mk_to_m_threadwise priorDstValue_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; }); }; }; @@ -397,7 +402,7 @@ struct GridwiseReduction_mk_to_m_threadwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -407,14 +412,14 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -424,7 +429,7 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 524da47e24..2ce64a9840 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -79,6 +79,8 @@ struct ThreadwiseTensorSliceTransfer_v1r3 { static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); } __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) @@ -250,6 +252,8 @@ struct ThreadwiseTensorSliceTransfer_v2 { static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! 
Not divisible"); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -313,7 +317,8 @@ struct ThreadwiseTensorSliceTransfer_v2 dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + i * src_scalar_step_in_vector); - dst_buf(Number{}) = src_vector.template AsType()[i]; + dst_buf(Number{}) = + type_convert(src_vector.template AsType()[i]); }); if constexpr(idx_1d.value != num_access - 1) @@ -439,6 +444,10 @@ struct ThreadwiseTensorSliceTransfer_v3 : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) { + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -1016,7 +1025,8 @@ struct ThreadwiseTensorSliceTransfer_v4 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc and DstDesc need to known at compile-time"); - static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong!"); + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); } template ::type src } else if constexpr(N == 2) { - llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); + llvm_amdgcn_raw_buffer_store_i16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); } else if constexpr(N == 4) { - llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); + llvm_amdgcn_raw_buffer_store_i16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); } else if constexpr(N == 8) { diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index b35999d56f..c2adfc5063 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -606,6 +606,12 @@ struct sequence_map_inverse SeqMap::Size()>::type; }; +template +__host__ __device__ constexpr bool operator==(Sequence, Sequence) +{ + return ((Xs == Ys) && ...); +} + template __host__ __device__ constexpr auto operator+(Sequence, Sequence) { diff --git a/include/ck/utility/tensor_space_filling_curve.hpp b/include/ck/utility/tensor_space_filling_curve.hpp index c5cbe461f0..62b68559bf 100644 --- a/include/ck/utility/tensor_space_filling_curve.hpp +++ b/include/ck/utility/tensor_space_filling_curve.hpp @@ -37,6 +37,10 @@ struct SpaceFillingCurve __host__ __device__ static constexpr index_t GetNumOfAccess() { + static_assert(TensorLengths::Size() == ScalarsPerAccess::Size()); + static_assert(TensorLengths{} % ScalarsPerAccess{} == + typename uniform_sequence_gen::type{}); + return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) / ScalarPerVector; } diff --git a/library/include/ck/library/host_tensor/host_generic_reduction.hpp b/library/include/ck/library/host_tensor/host_generic_reduction.hpp deleted file mode 100644 index d10184aaf6..0000000000 --- a/library/include/ck/library/host_tensor/host_generic_reduction.hpp +++ /dev/null @@ -1,424 +0,0 @@ - -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef HOST_GENERIC_REDUCTION_HPP_ -#define HOST_GENERIC_REDUCTION_HPP_ - -#include -#include -#include -#include -#include -#include - -#include "reduction_enums.hpp" -#include "host_reduce_util.hpp" - -using float16 = half_float::half; - -namespace ck { - -namespace host_reduce { - -template -static void -get_all_indexes(const std::vector& dimLengths, int dim, std::vector>& indexes) -{ - if(dim < dimLengths.size()) - { - std::vector> updated_indexes; - - if(dim == 0) - { - assert(indexes.size() == 0); - assert(dimLengths[dim] > 0); - for(T i = 0; i < dimLengths[dim]; i++) - { - std::vector index = {i}; - - updated_indexes.push_back(index); - }; - } - else - { - // go through all the current indexes - for(const auto& index : indexes) - for(T i = 0; i < dimLengths[dim]; i++) - { - auto index_new = index; - index_new.push_back(i); - - updated_indexes.push_back(index_new); - }; - }; - - // update to the indexes (output) - indexes = updated_indexes; - - // further to construct the indexes from the updated status - get_all_indexes(dimLengths, dim + 1, indexes); - }; -}; - -template -static T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * static_cast(index[i]); - - return (offset); -}; - -template -static inline T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && lengths.size() > 0); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * static_cast(index[i]); - - stride *= lengths[i]; - }; - - offset += stride * static_cast(index[0]); - - return (offset); -}; - -template -class ReductionHost -{ - public: - ReductionHost() = default; - ReductionHost(HostTensorDescriptor& inDesc, - HostTensorDescriptor& outDesc, - const std::vector& invariantDims_, - const std::vector& toReduceDims_) - { - this->inLengths = to_int_vector(inDesc.GetLengths()); - this->outLengths = to_int_vector(outDesc.GetLengths()); - this->inStrides = to_int_vector(inDesc.GetStrides()); - this->outStrides = to_int_vector(outDesc.GetStrides()); - - this->invariantDims = invariantDims_; - this->toReduceDims = toReduceDims_; - - 
assert(this->inLengths.size() == this->outLengths.size()); - assert(!this->toReduceDims.empty()); - - for(const auto dim : this->invariantDims) - this->invariantLengths.push_back(this->inLengths[dim]); - - for(const auto dim : this->toReduceDims) - toReduceLengths.push_back(this->inLengths[dim]); - - this->reduceAllDims = this->invariantDims.empty(); - }; - - ~ReductionHost(){}; - - void - Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) - { - if constexpr(NeedIndices) - RunImpl_with_indices(alpha, in_data, beta, out_data, indices); - else - RunImpl_no_indices(alpha, in_data, beta, out_data); - }; - - private: - std::vector inLengths; - std::vector outLengths; - std::vector inStrides; - std::vector outStrides; - - std::vector invariantLengths; - std::vector toReduceLengths; - - std::vector invariantDims; - std::vector toReduceDims; - - bool reduceAllDims; - - void RunImpl_with_indices( - float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) - { - using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; - using ck::host_reduce::PosUnaryOpFn; - using ck::host_reduce::PreUnaryOpFn; - using ck::host_reduce::ReduceOpFn2; - using ck::host_reduce::ReduceOpZeroVal; - - auto opReduce = ReduceOpFn2(); - - int divider = 1; - for(int i = 0; i < toReduceLengths.size(); i++) - divider *= toReduceLengths[i]; - - auto PreUnaryOp = PreUnaryOpFn(divider); - auto PosUnaryOp = PosUnaryOpFn(divider); - - if(reduceAllDims) - { - std::vector> indexes_1; - - get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space - - auto accuVal = ReduceOpZeroVal(); - int accuIndex = 0; - - // go through indexes of the invariant dimensions - for(const auto& src_index : indexes_1) - { - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - // unary operation before reducing, needed by AMAX. 
For MIN/MAX, nothing is actually - // done - PreUnaryOp(currVal); - - auto currIndex = get_flatten_offset(inLengths, src_index); - binop_with_nan_check2( - opReduce, accuVal, currVal, accuIndex, currIndex); - }; - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[0]) * static_cast(beta); - - // store the reduced value to dst location - out_data[0] = static_cast(accuVal); - indices[0] = accuIndex; - } - else - { - std::vector> indexes_1, indexes_2; - - get_all_indexes( - this->invariantLengths, 0, indexes_1); // generate the invariant indexes space - get_all_indexes( - this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space - - // go through indexes of the invariant dimensions - for(const auto& index_1 : indexes_1) - { - std::vector src_index; - std::vector dst_index; - - src_index.resize(this->inLengths.size()); - - // generate the part of src index belonging to invariant dims - for(int k = 0; k < invariantDims.size(); k++) - src_index[invariantDims[k]] = index_1[k]; - - for(int k = 0; k < invariantDims.size(); k++) - dst_index.push_back(index_1[k]); - - int dst_offset = get_offset_from_index(this->outStrides, dst_index); - - AccDataType accuVal = ReduceOpZeroVal(); - int accuIndex = 0; - - // go through indexes of the toReduce dimensions - for(const auto& index_2 : indexes_2) - { - // generate the part of src index belonging to toReduce dims - for(int k = 0; k < toReduceDims.size(); k++) - src_index[toReduceDims[k]] = index_2[k]; - - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - // unary operation before reducing, needed by AMAX. 
For MIN/MAX, nothing is - // actually done - PreUnaryOp(currVal); - - auto currIndex = get_flatten_offset(toReduceLengths, index_2); - binop_with_nan_check2( - opReduce, accuVal, currVal, accuIndex, currIndex); - }; - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[dst_offset]) * - static_cast(beta); - - // store the reduced value to dst location - out_data[dst_offset] = static_cast(accuVal); - indices[dst_offset] = accuIndex; - }; - }; - }; // end of RunImpl_with_indices() - - void - RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) - { - using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; - using ck::host_reduce::PosUnaryOpFn; - using ck::host_reduce::PreUnaryOpFn; - using ck::host_reduce::ReduceOpFn; - using ck::host_reduce::ReduceOpZeroVal; - - auto opReduce = ReduceOpFn(); - - int divider = 1; - for(int i = 0; i < toReduceLengths.size(); i++) - divider *= toReduceLengths[i]; - - auto PreUnaryOp = PreUnaryOpFn(divider); - auto PosUnaryOp = PosUnaryOpFn(divider); - - if(reduceAllDims) - { - std::vector> indexes_1; - - get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space - - auto accuVal = ReduceOpZeroVal(); - - // go through indexes of the invariant dimensions - for(const auto& src_index : indexes_1) - { - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - PreUnaryOp(currVal); - - binop_with_nan_check(opReduce, accuVal, currVal); - }; - - PosUnaryOp(accuVal); - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[0]) * static_cast(beta); - - // store the reduced value to dst location - out_data[0] = static_cast(accuVal); - } - else - { - std::vector> indexes_1, indexes_2; - - get_all_indexes( - this->invariantLengths, 0, indexes_1); // generate the invariant indexes space - get_all_indexes( - this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space - - // go through indexes of the invariant dimensions - for(const auto& index_1 : indexes_1) - { - std::vector src_index; - std::vector dst_index; - - src_index.resize(this->inLengths.size()); - - for(int k = 0; k < invariantDims.size(); k++) - dst_index.push_back(index_1[k]); - - int dst_offset = get_offset_from_index(this->outStrides, dst_index); - - // generate the part of src index belonging to invariant dims - for(int k = 0; k < invariantDims.size(); k++) - src_index[invariantDims[k]] = index_1[k]; - - AccDataType accuVal = ReduceOpZeroVal(); - - // go through indexes of the toReduce dimensions - for(const auto& index_2 : indexes_2) - { - // generate the part of src index belonging to toReduce dims - for(int k = 0; k < toReduceDims.size(); k++) - src_index[toReduceDims[k]] = index_2[k]; - - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - PreUnaryOp(currVal); - - binop_with_nan_check(opReduce, accuVal, currVal); - }; - - PosUnaryOp(accuVal); - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= 
static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[dst_offset]) * - static_cast(beta); - - // store the reduced value to dst location - out_data[dst_offset] = static_cast(accuVal); - }; - }; - }; // end of RunImpl_no_indices() -}; - -}; // end of namespace host_reduce - -}; // end of namespace ck - -#endif diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index a176962bb1..f5e01ccc94 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -66,22 +66,22 @@ static inline bool float_equal_zero(half_float::half x) return x == static_cast(0.0f); }; -template -__host__ static inline std::function PreUnaryOpFn(int) +template +__host__ static inline std::function PreUnaryOpFn(int) { using std::abs; if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1) { - return ([&](compType& a_) { a_ = abs(a_); }); + return ([&](AccDataType& a_) { a_ = abs(a_); }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_) { a_ = a_ * a_; }); + return ([&](AccDataType& a_) { a_ = a_ * a_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_) { a_ = abs(a_); }); + return ([&](AccDataType& a_) { a_ = abs(a_); }); } else { @@ -90,23 +90,23 @@ __host__ static inline std::function PreUnaryOpFn(int) // ReduceTensorOp_t::MUL: // ReduceTensorOp_t::MIN: // ReduceTensorOp_t::MAX: - return ([&](compType&) {}); + return ([&](AccDataType&) {}); }; }; -template -__host__ static inline std::function PosUnaryOpFn(int divider) +template +__host__ static inline std::function PosUnaryOpFn(int32_t divider) { using std::sqrt; if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_) { a_ = sqrt(a_); }); + return ([&](AccDataType& a_) { a_ = sqrt(a_); }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG) { - return ([&, divider](compType& a_) { - a_ = a_ / static_cast(static_cast(divider)); + return ([&, divider](AccDataType& a_) { + a_ = a_ / static_cast(static_cast(divider)); }); } else @@ -117,44 +117,44 @@ __host__ static inline std::function PosUnaryOpFn(int divider) // ReduceTensorOp_t::MIN: // ReduceTensorOp_t::MAX: // ReduceTensorOp_t::AMAX: - return ([&](compType&) {}); + return ([&](AccDataType&) {}); } }; -template -__host__ static inline std::function ReduceOpFn() +template +__host__ static inline std::function ReduceOpFn() { if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) { - return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return ([&](compType& a_, compType b_) { + return ([&](AccDataType& a_, AccDataType b_) { if(a_ > b_) a_ = b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_, compType b_) { + return ([&](AccDataType& a_, AccDataType b_) { if(a_ < b_) a_ = b_; }); } }; -template -__host__ static inline std::function ReduceOpFn2() 
+template +__host__ static inline std::function ReduceOpFn2() { if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return ([&](compType& a_, compType b_, bool& changed) { + return ([&](AccDataType& a_, AccDataType b_, bool& changed) { if(a_ > b_) { a_ = b_; @@ -166,7 +166,7 @@ __host__ static inline std::function R } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_, compType b_, bool& changed) { + return ([&](AccDataType& a_, AccDataType b_, bool& changed) { if(a_ < b_) { a_ = b_; @@ -183,28 +183,28 @@ __host__ static inline std::function R // ReduceTensorOp_t::AVG: // ReduceTensorOp_t::NORM1: // ReduceTensorOp_t::NORM2: - return (std::function{}); + return (std::function{}); }; }; -template -__host__ static inline compType ReduceOpZeroVal() +template +__host__ static inline AccDataType ReduceOpZeroVal() { if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) { - return (static_cast(1.0f)); + return (static_cast(1.0f)); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return (std::numeric_limits::max()); + return (std::numeric_limits::max()); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX) { - return (std::numeric_limits::lowest()); + return (std::numeric_limits::lowest()); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) { - return (static_cast(0.0f)); + return (static_cast(0.0f)); } else { @@ -212,14 +212,15 @@ __host__ static inline compType ReduceOpZeroVal() // ReduceTensorOp_t::AVG // ReduceTensorOp_t::NORM1 // ReduceTensorOp_t::NORM2 - return (static_cast(0.0f)); + return (static_cast(0.0f)); }; }; -template -__host__ static inline void binop_with_nan_check(std::function opReduce, - compType& accuVal, - compType currVal) +template +__host__ static inline void +binop_with_nan_check(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal) { using std::isnan; @@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function +template __host__ static inline void -binop_with_nan_check2(std::function opReduce, - compType& accuVal, - compType currVal, +binop_with_nan_check2(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal, int& accuIndex, int currIndex) { diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp new file mode 100644 index 0000000000..fe9fba6121 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -0,0 +1,402 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef HOST_REDUCTION_HPP_ +#define HOST_REDUCTION_HPP_ + +#include +#include +#include + +#include "reduction_enums.hpp" +#include "host_reduce_util.hpp" +#include "host_tensor.hpp" +#include "data_type.hpp" + +template +static void get_all_indexes(const std::array& dimLengths, + std::vector>& indexes) +{ + static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); + + if constexpr(NDim == 1) + { + for(size_t i = 0; i < dimLengths[0]; i++) + { + std::array index{i}; + + indexes.push_back(index); + }; + } + else + { + std::array partial_dim_lengths; + + for(int i = 0; i < NDim - 1; i++) + partial_dim_lengths[i] = dimLengths[i + 1]; + + std::vector> partial_indexes; + + get_all_indexes(partial_dim_lengths, partial_indexes); + + for(size_t i = 0; i < dimLengths[0]; i++) + for(const auto& index : partial_indexes) + { + std::array extIndex; + + extIndex[0] = i; + + for(int k = 0; k < NDim - 1; k++) + extIndex[k + 1] = index[k]; + + indexes.push_back(extIndex); + }; + }; +}; + +template +static size_t get_offset_from_index(const std::array& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +static size_t get_offset_from_index(const std::vector& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +struct ReductionHost +{ + using IndexDataType = int32_t; + + static constexpr int NumInvariantDim = Rank - NumReduceDim; + + std::vector outStrides; + std::vector invariantDims; + std::vector reduceDims; + + IndexDataType divider; + std::function preUnaryOp; + std::function posUnaryOp; + std::array reduceLengths; + std::array reduceStrides; + std::array invariantLengths; + std::array invariantStrides; + + std::vector> reduce_dim_indexes; + std::vector> invariant_dim_indexes; + + ReductionHost(HostTensorDescriptor& inDesc, + HostTensorDescriptor& outDesc, + const std::vector& invariantDims_, + const std::vector& reduceDims_) + { + using ck::host_reduce::PosUnaryOpFn; + using ck::host_reduce::PreUnaryOpFn; + + // this->outLengths = to_int_vector(outDesc.GetLengths()); + this->outStrides = outDesc.GetStrides(); + + this->invariantDims = invariantDims_; + this->reduceDims = reduceDims_; + + int product = 1; + + for(int i = 0; i < NumReduceDim; i++) + { + reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]]; + reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]]; + product *= inDesc.GetLengths()[reduceDims[i]]; + }; + + divider = product; + + for(int i = 0; i < NumInvariantDim; i++) + { + invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]]; + invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]]; + }; + + reduce_dim_indexes.clear(); + get_all_indexes(reduceLengths, reduce_dim_indexes); + + if constexpr(NumInvariantDim > 0) + { + invariant_dim_indexes.clear(); + get_all_indexes(invariantLengths, invariant_dim_indexes); + }; + + preUnaryOp = PreUnaryOpFn(divider); + posUnaryOp = PosUnaryOpFn(divider); + }; + + void Run(float alpha, + const InDataType* in_data, + float beta, + OutDataType* 
out_data, + IndexDataType* out_indices) + { + if constexpr(NeedIndices) + { + RunImpl_with_index(alpha, in_data, beta, out_data, out_indices); + } + else + { + RunImpl_no_index(alpha, in_data, beta, out_data); + }; + }; + + void RunImpl_with_index(float alpha, + const InDataType* in_data, + float beta, + OutDataType* out_data, + IndexDataType* out_indices) + { + using ck::type_convert; + using ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::ReduceOpFn2; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce2 = ReduceOpFn2(); + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; + + for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = type_convert(in_data[offset_reduce]); + + preUnaryOp(currVal); + + auto currIndex = i; + + binop_with_nan_check2( + opReduce2, accuVal, currVal, accuIndex, currIndex); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + out_indices[0] = accuIndex; + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + preUnaryOp(currVal); + + auto currIndex = i; + + binop_with_nan_check2( + opReduce2, accuVal, currVal, accuIndex, currIndex); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + out_indices[dst_offset] = accuIndex; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; + + void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) + { + using ck::type_convert; + using ck::host_reduce::binop_with_nan_check; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::ReduceOpFn; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce = ReduceOpFn(); + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOpZeroVal(); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = 
type_convert(in_data[offset_reduce]); + + preUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOpZeroVal(); + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + preUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; +}; + +#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index 6fd30b7cb6..fafbe120b9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -6,23 +6,36 @@ #include "device_reduce_instance_blockwise_f32_f32_f32.hpp" #include "device_reduce_instance_blockwise_f32_f64_f32.hpp" #include "device_reduce_instance_blockwise_f64_f64_f64.hpp" +#include "device_reduce_instance_blockwise_i8_i8_i8.hpp" +#include "device_reduce_instance_blockwise_i8_i32_i8.hpp" +#include "device_reduce_instance_blockwise_b16_f32_b16.hpp" #include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp" #include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp" #include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp" #include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp" #include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp" +#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp" +#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp" +#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp" #include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" +#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp" #include 
"device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp" #include "device_reduce_instance_threadwise_f16_f16_f16.hpp" #include "device_reduce_instance_threadwise_f16_f32_f16.hpp" #include "device_reduce_instance_threadwise_f32_f32_f32.hpp" #include "device_reduce_instance_threadwise_f32_f64_f32.hpp" #include "device_reduce_instance_threadwise_f64_f64_f64.hpp" +#include "device_reduce_instance_threadwise_i8_i8_i8.hpp" +#include "device_reduce_instance_threadwise_i8_i32_i8.hpp" +#include "device_reduce_instance_threadwise_b16_f32_b16.hpp" #endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index b71707294c..64d89e41b0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -17,7 +17,6 @@ using reduce_configuration_2_instances_blockwise = std::tuple< ReductionConfiguration_2<0, 2, 2, 2, 1>, ReductionConfiguration_2<0, 1, 1, 2, 1>, ReductionConfiguration_2<1, 2, 1, 1, 2>, - ReductionConfiguration_2<1, 2, 2, 1, 2>, ReductionConfiguration_2<0, 1, 1, 3, 1>, ReductionConfiguration_2<1, 1, 1, 1, 3> // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp new file mode 100644 index 0000000000..0ae3289a0d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp @@ -0,0 +1,60 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 
1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
index 42b2482085..e7bdb15d92 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
index fdf2f8b587..dad0d86350 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
index 877b687d24..34ec15db2b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
index 48f3ab567f..b08f35ad09 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
index d88bd341a2..65cdd45340 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
new file mode 100644
index 0000000000..8d222d53dc
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
new file mode 100644
index 0000000000..7f67138e6b
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
index 6ffe22ec0c..5a0c18e7a3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
@@ -15,9 +15,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
 // clang-format off
 // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
-ReductionConfiguration_2<1, 1, 1, 1, 3>,
-ReductionConfiguration_2<1, 1, 2, 1, 3>
+ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
 >;
 #else
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
index bf78feb552..4ce19c7d0c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
new file mode 100644
index 0000000000..c85419befc
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
index 3e880b6929..d42e7e020f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
index 01b1a3103a..fcf244d1d3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
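Across all of these instance headers, the fourth macro argument is a numeric ReduceOpId, annotated by the `// for ADD` style comments: 0 = ADD, 2 = MIN, 3 = MAX, 4 = AMAX, 5 = AVG, 7 = NORM2. As a reading aid, here is that mapping as a minimal sketch; the names are illustrative, and the library's actual enum lives in reduction_enums.hpp and may spell them differently.

```cpp
// Illustrative mapping of the numeric ReduceOpId codes used by the
// ADD_*_INST_REF_BY_ID macros above; not the library's actual enum.
enum class ReduceOpId : int
{
    ADD   = 0, // plain sum
    MIN   = 2, // minimum; may also return the winning index (IndicesOpt = 1)
    MAX   = 3, // maximum; may also return the winning index
    AMAX  = 4, // maximum absolute value; may also return the winning index
    AVG   = 5, // arithmetic mean
    NORM2 = 7  // square root of the sum of squares
};
```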
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
index 46908a4c56..72e806ee60 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
index 2182c2eac2..476c3a7d8f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
new file mode 100644
index 0000000000..d46780483b
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
new file mode 100644
index 0000000000..7b020fb439
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
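The *_second_call instance files above mirror the multiblock partial-reduce instances further down: a multiblock kernel first leaves one partial result per workgroup in the workspace, held in the accumulation type (hence pairings such as f32_f32_b16 or i32_i32_i8, where the second call reads the wide workspace type and writes the narrow output type), and a blockwise "second call" then folds those partials into the final output. Below is a minimal host-side sketch of the same two-stage idea, assuming a non-empty input; the library of course runs both stages on the GPU.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Stage 1: each "block" reduces its slice to one partial result (the
// multiblock partial reduce). Stage 2: one pass over the partials (the
// blockwise second call) produces the final value.
float two_stage_max(const std::vector<float>& data, std::size_t block_size)
{
    std::vector<float> partials; // plays the role of the workspace buffer
    for(std::size_t i = 0; i < data.size(); i += block_size)
    {
        const std::size_t last = std::min(i + block_size, data.size());
        partials.push_back(*std::max_element(data.begin() + i, data.begin() + last));
    }
    return *std::max_element(partials.begin(), partials.end());
}
```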
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
index d3f62e4050..3b317e1d80 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
 ReductionConfiguration_2<0, 2, 2, 2, 1>,
 ReductionConfiguration_2<0, 1, 1, 2, 1>,
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
 ReductionConfiguration_2<0, 1, 1, 3, 1>,
 ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
new file mode 100644
index 0000000000..58f90bb94f
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
index f1c53b9bce..f4c766ca03 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
index 07258be297..c2f2564fc9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
index 7cd5bc778e..830dcf9407 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
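Note that the multiblock atomic-add headers instantiate only ReduceOpId 0 (ADD) and 5 (AVG). A plausible reading: each workgroup can commit its partial sum straight into the destination with an atomic add, so no workspace and no second call are needed, but that shortcut only exists for reductions whose cross-block combine is itself an addition; MIN/MAX/AMAX and NORM2 stay on the partial-reduce path. A toy host-side illustration of the idea with std::atomic follows; the real kernels use GPU atomics.

```cpp
#include <atomic>
#include <numeric>
#include <vector>

// Each "workgroup" folds its partial sum into the shared result with an
// atomic update. Valid for ADD/AVG because the cross-block combine is an
// addition; the commit order is nondeterministic, so floating-point
// results may differ slightly between runs.
void block_commit(std::atomic<double>& out, const std::vector<double>& slice)
{
    const double partial = std::accumulate(slice.begin(), slice.end(), 0.0);
    double cur = out.load();
    while(!out.compare_exchange_weak(cur, cur + partial)) {}
}
```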
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
new file mode 100644
index 0000000000..d25645ad1e
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
index d58acf14ca..05549fc702 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
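In every file of this patch, the IndicesOpt column (sixth argument) is 1 only on MIN/MAX/AMAX rows: those are the operations for which the caller may request the index of the winning element alongside its value, so the accumulator conceptually carries a (value, index) pair. A hedged sketch of that accumulator, assuming a non-empty input and int32 indices:

```cpp
#include <cstdint>
#include <vector>

// Value-plus-index accumulator, as used conceptually when IndicesOpt == 1.
struct ValueIndex
{
    float value;
    int32_t index;
};

ValueIndex arg_max(const std::vector<float>& x) // assumes x is non-empty
{
    ValueIndex best{x[0], 0};
    for(int32_t i = 1; i < static_cast<int32_t>(x.size()); ++i)
        if(x[i] > best.value)
            best = ValueIndex{x[i], i};
    return best;
}
```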
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
index 54c5b853b1..3e4aaef51b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
index f7f476abc1..2a1e4e7bf0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
@@ -13,25 +13,32 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
index 86455fd913..f95e3001ee 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
@@ -13,6 +13,7 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
index 55b69257b6..fac65128b6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
@@ -13,33 +13,42 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 // Will be moved to use MultiBlockAtomicAdd
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
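The int8 instance files added next pair int8_t data with an int32_t accumulator for ADD and AVG, while MIN/MAX/AMAX keep an int8_t accumulator (a minimum or maximum never leaves the input's range). The reason for the wide accumulator is overflow, as sketched below:

```cpp
#include <cstdint>
#include <vector>

// Why ADD/AVG over int8 accumulate in int32: even two int8 values can
// already overflow the int8 range [-128, 127] when summed.
int32_t sum_i8(const std::vector<int8_t>& x)
{
    int32_t acc = 0; // wide accumulator, mirroring the i8_i32_i8 instances
    for(const int8_t v : x)
        acc += v; // safe for roughly 16 million worst-case elements in int32
    return acc;
}
```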
DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp new file mode 100644 index 0000000000..d6bee57fcd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp @@ -0,0 +1,47 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN 
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index 3321791207..9371672a54 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
 ReductionConfiguration_2<0, 2, 2, 2, 1>,
 ReductionConfiguration_2<0, 1, 1, 2, 1>,
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
 ReductionConfiguration_2<0, 1, 1, 3, 1>,
 ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
new file mode 100644
index 0000000000..f11d9118c9
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
index 5d8a037cb4..fe220335c5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
index 8a50074054..970559cfac 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
index 2ad2535523..66c33a72a4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
index 2dca1e40df..196f142dbf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
index 8fcfaa38f8..4f3e1448d0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
new file mode 100644
index 0000000000..8f19a5d0a2
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
new file mode 100644
index 0000000000..83bd48cd3f
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
index c64d8b1361..cced3a4b76 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
@@ -5,24 +5,37 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
 device_reduce_instance_blockwise_f32_f32_f32.cpp;
 device_reduce_instance_blockwise_f32_f64_f32.cpp;
 device_reduce_instance_blockwise_f64_f64_f64.cpp;
+device_reduce_instance_blockwise_i8_i32_i8.cpp;
+device_reduce_instance_blockwise_i8_i8_i8.cpp;
+device_reduce_instance_blockwise_b16_f32_b16.cpp;
 device_reduce_instance_threadwise_f16_f16_f16.cpp;
 device_reduce_instance_threadwise_f16_f32_f16.cpp;
 device_reduce_instance_threadwise_f32_f32_f32.cpp;
 device_reduce_instance_threadwise_f32_f64_f32.cpp;
 device_reduce_instance_threadwise_f64_f64_f64.cpp;
+device_reduce_instance_threadwise_i8_i32_i8.cpp;
+device_reduce_instance_threadwise_i8_i8_i8.cpp;
+device_reduce_instance_threadwise_b16_f32_b16.cpp;
 device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp;
 device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp;
 device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp;
 device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp;
 device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp;
+device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp;
+device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp;
+device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp;
 device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp;
 device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp;
 device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp;
+device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp;
 device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp;
 device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp;
+device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp;
+device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp;
+device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
 )
 
 add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
new file mode 100644
index 0000000000..0274d89fc9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
index aa7c69e362..8a43d860ea 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
index 5a8e5eb625..3e0b8ba59c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
index cfe7cd86e9..ee96311f8c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
index 453a2c6437..b0ae95e82d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
index 0499bd3987..9cca2dbbeb 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
new file mode 100644
index 0000000000..05cd1921ee
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
new file mode 100644
index 0000000000..66ef017864
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
index dd5514daca..82a9c11413 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp
new file mode 100644
index 0000000000..6b8139c32c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
index 295b31f629..267b9d4d9d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
index 08b1592eab..0036a89542 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
index ba46891d0e..0512fa4158 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
index 3a8ddadb2e..afe7f0752e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp new file mode 100644 index 0000000000..9cb3b8684f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp new file mode 100644 index 0000000000..8783a75486 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp @@ -0,0 +1,40 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); 
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp new file mode 100644 index 0000000000..9b2b7f5d8c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp index 847a3b6ac9..fc956aa04b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp @@ -8,9 +8,11 @@ namespace device_reduce_instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); 
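The numeric `ReduceOpId` column in these tables is easier to read with the operation names spelled out. A minimal decoding aid follows; it is not part of the diff, and the values are simply inferred from the inline `// for ...` comments above (the authoritative definitions live in `reduction_enums.hpp` as `ReduceTensorOp_t`):

```cpp
// Hypothetical reading aid only. IDs 1 and 6 are simply not exercised by
// these instance files, hence the gaps.
enum class ReduceOpId : int
{
    ADD   = 0,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4, // max of absolute value
    AVG   = 5,
    NORM2 = 7, // square root of the sum of squares
};
```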
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
new file mode 100644
index 0000000000..9b2b7f5d8c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
index 847a3b6ac9..fc956aa04b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
index 77fe2d8a05..e5ffd9f976 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
index a748dc263c..229829b889 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
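Only ADD and AVG (ReduceOpId 0 and 5) appear in the MultiBlockAtomicAdd tables: on this path each workgroup folds its partial result into the output with a single atomic add, which is only valid when the final combine is itself an addition (AVG is an addition followed by an elementwise scale). A minimal sketch of the idea, using std::atomic in place of the HIP atomics the real kernels use; everything here is illustrative, not CK API:

```cpp
#include <atomic>

// Each "workgroup" computes a partial sum over its slice of the reduced
// dimensions, then commits it with one atomic add. This is correct for
// ADD/AVG because addition is commutative and associative; MIN/MAX/AMAX
// need a different final combine, hence the partial-reduce path below.
// Note: fetch_add on std::atomic<float> requires C++20.
void commit_partial_sum(std::atomic<float>& out, float partial_sum)
{
    out.fetch_add(partial_sum, std::memory_order_relaxed);
}
```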
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp
new file mode 100644
index 0000000000..d740fcfa8f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
index 527ebc5386..f57ed5ad86 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
index ace76f4675..724b364104 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
index 767dca99bd..15028a0b4c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
@@ -8,25 +8,32 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
index 2ed21e74e8..ec0ba3cf8e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
@@ -8,6 +8,7 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
index 95bd1daa8f..9ff2dcd93b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
@@ -8,33 +8,42 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 
 // Will be moved to use MultiBlockAtomicAdd
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
new file mode 100644
index 0000000000..0e37c2947f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
new file mode 100644
index 0000000000..4634faed06
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
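The MultiBlockPartialReduce instances never produce the final value on their own: each workgroup writes one partial accumulation into a workspace buffer, and a BlockWiseSecondCall instance (registered above) re-reduces that workspace, which is why `GetWorkspaceSizeInBytes` in this PR now also takes `reduceDims`. A plain-C++ illustration of the two-stage scheme, with no CK types involved (the real driver code is in `profile_reduce_impl.hpp` later in this diff):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

// Stage 1 splits the reduced elements across "workgroups" that each emit one
// partial result into a workspace; stage 2 reduces the workspace to the final
// value. On the GPU the workspace lives in ws_dev and stage 2 is the
// BlockWiseSecondCall kernel.
int main()
{
    std::vector<float> input(1024, 1.0f);
    const int num_groups = 8;
    const int per_group  = static_cast<int>(input.size()) / num_groups;

    std::vector<float> workspace(num_groups); // stage-1 output, one slot per group
    for(int g = 0; g < num_groups; ++g)
        workspace[g] = std::accumulate(input.begin() + g * per_group,
                                       input.begin() + (g + 1) * per_group,
                                       0.0f);

    float result = std::accumulate(workspace.begin(), workspace.end(), 0.0f); // stage 2
    std::printf("sum = %f\n", result); // prints 1024.000000
}
```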
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp
new file mode 100644
index 0000000000..02fc4b4c01
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
index 70b667e7d2..0984cdc46b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
index 6b81513c27..64f14bd4e7 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
index 27076415e6..69ed303b17 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
index 52c84a4278..5d791cec41 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
index f77122d5a0..16c0409134 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
new file mode 100644
index 0000000000..7af7bc03f2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
new file mode 100644
index 0000000000..9580aae057
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
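Every MIN/MAX/AMAX table in the instance files above appears twice, once with IndicesOpt = 0 and once with IndicesOpt = 1, because only those three operations can also return the index of the winning element; profile_reduce_impl.hpp below encodes the same distinction as `op_support_indices`. A self-contained sketch of that predicate, with the operation IDs taken from the tables (illustration only, not the CK definition):

```cpp
#include <cstdio>

// Mirrors the op_support_indices idea from profile_reduce_impl.hpp: only
// MIN (2), MAX (3) and AMAX (4) have a meaningful "index of the result";
// ADD/AVG/NORM2 fold all elements together, so no single index exists.
constexpr bool op_support_indices(int reduce_op_id)
{
    return reduce_op_id == 2 || reduce_op_id == 3 || reduce_op_id == 4;
}

int main()
{
    std::printf("ADD supports indices?  %d\n", op_support_indices(0)); // prints 0
    std::printf("AMAX supports indices? %d\n", op_support_indices(4)); // prints 1
}
```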
{ namespace tensor_operation { @@ -20,34 +20,43 @@ struct ReduceDescription }; using reduce_description_instances = std::tuple, // for ADD + ReduceDescription<4, 4, 0, 0, 0>, ReduceDescription<4, 1, 0, 0, 0>, ReduceDescription<2, 1, 0, 0, 0>, ReduceDescription<4, 3, 5, 0, 0>, // for AVG + ReduceDescription<4, 4, 5, 0, 0>, ReduceDescription<4, 1, 5, 0, 0>, ReduceDescription<2, 1, 5, 0, 0>, ReduceDescription<4, 3, 7, 0, 0>, // for NORM2 + ReduceDescription<4, 4, 7, 0, 0>, ReduceDescription<4, 1, 7, 0, 0>, ReduceDescription<2, 1, 7, 0, 0>, ReduceDescription<4, 3, 2, 0, 0>, // for MIN + ReduceDescription<4, 4, 2, 0, 0>, ReduceDescription<4, 1, 2, 0, 0>, ReduceDescription<2, 1, 2, 0, 0>, ReduceDescription<4, 3, 3, 0, 0>, // for MAX + ReduceDescription<4, 4, 3, 0, 0>, ReduceDescription<4, 1, 3, 0, 0>, ReduceDescription<2, 1, 3, 0, 0>, ReduceDescription<4, 3, 4, 0, 0>, // for AMAX + ReduceDescription<4, 4, 4, 0, 0>, ReduceDescription<4, 1, 4, 0, 0>, ReduceDescription<2, 1, 4, 0, 0>, ReduceDescription<4, 3, 2, 0, 1>, // for MIN + ReduceDescription<4, 4, 2, 0, 1>, ReduceDescription<4, 1, 2, 0, 1>, ReduceDescription<2, 1, 2, 0, 1>, ReduceDescription<4, 3, 3, 0, 1>, // for MAX + ReduceDescription<4, 4, 3, 0, 1>, ReduceDescription<4, 1, 3, 0, 1>, ReduceDescription<2, 1, 3, 0, 1>, ReduceDescription<4, 3, 4, 0, 1>, // for AMAX + ReduceDescription<4, 4, 4, 0, 1>, ReduceDescription<4, 1, 4, 0, 1>, ReduceDescription<2, 1, 4, 0, 1>>; @@ -122,16 +131,16 @@ static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) }; // map the data type used by the GPU kernels to the corresponding type used by the host codes -template +template struct type_mapping { - using outDataType = inDataType; + using OutType = InType; }; template <> struct type_mapping { - using outDataType = half_float::half; + using OutType = half_float::half; }; template ::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InDataType is int8_t, the supported operation must be either indexable operations or + // ADD/AVG + constexpr bool invalid_reduce_5 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp_t::ADD && + ReduceOpId != ReduceTensorOp_t::AVG); + + // 1) If InDataType is bhalf_t, must use float as AccDataType for all reduction operations + constexpr bool invalid_reduce_6 = + std::is_same::value && !std::is_same::value; + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || + invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); if constexpr(!invalid_reduce) { @@ -205,8 +233,8 @@ void profile_reduce_impl_impl(bool do_verification, Tensor out_ref(outLengths); Tensor out(outLengths); - Tensor out_indices_ref(outLengths); - Tensor out_indices(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); auto inStrides = in.mDesc.GetStrides(); auto outStrides = out.mDesc.GetStrides(); @@ -220,20 +248,22 @@ void profile_reduce_impl_impl(bool do_verification, { switch(init_method) { - case 0: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; + case 0: break; case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); if(beta != 0.0f) out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, 
num_thread); break; default: - in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); } if(beta != 0.0f) @@ -306,6 +336,7 @@ void profile_reduce_impl_impl(bool do_verification, IndicesOpt>(reduce0_ptrs); if constexpr(use_atomic_add) + { add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); + } else + { add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + }; // used for secondary reduction if constexpr(!use_atomic_add) + { add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + }; if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) { @@ -342,17 +378,24 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { - using hInType = typename type_mapping::outDataType; - using hOutType = typename type_mapping::outDataType; - using hCompType = typename type_mapping::outDataType; + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; - ReductionHost + ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), + reinterpret_cast(in.mData.data()), beta, - reinterpret_cast(out_ref.mData.data()), + reinterpret_cast(out_ref.mData.data()), out_indices_ref.mData.data()); }; @@ -363,24 +406,27 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce_ptr : reduce0_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); DeviceMem ws_dev(wsSizeInBytes); - auto argument_ptr = reduce_ptr->MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_0{static_cast(reduce_total_length)}, - AccElementwiseOperation_0{static_cast(reduce_total_length)}); + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0( + static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) continue; @@ -445,24 +491,27 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce_ptr : reduce1_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); DeviceMem ws_dev(wsSizeInBytes); - auto argument_ptr = reduce_ptr->MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_1{static_cast(reduce_total_length)}, - AccElementwiseOperation_1{static_cast(reduce_total_length)}); + InElementwiseOperation_1 
in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1( + static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) continue; @@ -482,20 +531,25 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce2_ptr : reduce2_ptrs) { - auto argument2_ptr = reduce2_ptr->MakeArgumentPointer( - inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_2{static_cast(reduce_total_length)}, - AccElementwiseOperation_2{static_cast(reduce_total_length)}); + InElementwiseOperation_2 in_elementwise_op_2( + static_cast(reduce_total_length)); + AccElementwiseOperation_2 acc_elementwise_op_2( + static_cast(reduce_total_length)); + + auto argument2_ptr = + reduce2_ptr->MakeArgumentPointer(inLengths2, + inStrides2, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + ws_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_2, + acc_elementwise_op_2); if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) continue; diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index ef8fd1115b..4ae1eeda8b 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, {"scales", required_argument, nullptr, 'S'}, {"half", no_argument, nullptr, '?'}, {"double", no_argument, nullptr, '?'}, + {"int8", no_argument, nullptr, '?'}, + {"bf16", no_argument, nullptr, '?'}, {"dumpout", required_argument, nullptr, 'o'}, {"verify", required_argument, nullptr, 'v'}, {"log", required_argument, nullptr, 'l'}, @@ -119,6 +121,8 @@ class AppArgs public: bool use_half = false; bool use_double = false; + bool use_int8 = false; + bool use_bf16 = false; std::vector inLengths; std::vector outLengths; @@ -169,6 +173,8 @@ class AppArgs << std::endl; std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl; std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl; + std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl; + std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " "comparing with the host-based reduction" << std::endl; @@ -267,6 +273,10 @@ class AppArgs use_half = true; else if(std::string(long_options[option_index].name) == "double") use_double = true; + else if(std::string(long_options[option_index].name) == "int8") + use_int8 = true; + else if(std::string(long_options[option_index].name) == "bf16") + use_bf16 = true; else if(std::string(long_options[option_index].name) == "help") { show_usage(argv[0]); @@ -385,6 +395,71 @@ int profile_reduce(int argc, char* argv[]) args.scales[0], args.scales[1]); } + else if(args.use_int8) + { + if(!args.compType_assigned) + args.compTypeId = appInt8; + + 
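// Editor's note: each --<type> flag below maps a runtime choice onto a
// compile-time instantiation. Condensed sketch of the dispatch (the template
// parameter order <InDataType, AccDataType, OutDataType> is an assumption):
//
//     --half   -> profile_reduce_impl<half_t,  float,   half_t>
//     --double -> profile_reduce_impl<double,  double,  double>
//     --int8   -> profile_reduce_impl<int8_t,  int8_t,  int8_t>   // compTypeId == appInt8
//                 profile_reduce_impl<int8_t,  int32_t, int8_t>   // compTypeId == appInt32
//     --bf16   -> profile_reduce_impl<bhalf_t, float,   bhalf_t>  // bf16 always accumulates in fp32
//
// Only int8 exposes two accumulator choices: int8 partial sums of ADD/AVG over
// long reductions would overflow, hence the int32 option.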
+        if(args.outType_assigned && (args.outTypeId != appInt8 && args.outTypeId != appInt32))
+            args.outTypeId = appInt32;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appInt8;
+
+        if(args.compTypeId == appInt8)
+        {
+            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
+                                                        args.init_method,
+                                                        args.do_log,
+                                                        args.do_dumpout,
+                                                        args.nrepeat,
+                                                        args.inLengths,
+                                                        args.reduceDims,
+                                                        args.reduceOp,
+                                                        args.nanOpt,
+                                                        args.indicesOpt,
+                                                        args.scales[0],
+                                                        args.scales[1]);
+        }
+        else if(args.compTypeId == appInt32)
+        {
+            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
+                                                         args.init_method,
+                                                         args.do_log,
+                                                         args.do_dumpout,
+                                                         args.nrepeat,
+                                                         args.inLengths,
+                                                         args.reduceDims,
+                                                         args.reduceOp,
+                                                         args.nanOpt,
+                                                         args.indicesOpt,
+                                                         args.scales[0],
+                                                         args.scales[1]);
+        }
+        else
+            throw std::runtime_error("Invalid compType assignment!");
+    }
+    else if(args.use_bf16)
+    {
+        if(args.outType_assigned && (args.outTypeId != appBFloat16 && args.outTypeId != appFloat))
+            args.outTypeId = appFloat;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appBFloat16;
+
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
+                                                             args.init_method,
+                                                             args.do_log,
+                                                             args.do_dumpout,
+                                                             args.nrepeat,
+                                                             args.inLengths,
+                                                             args.reduceDims,
+                                                             args.reduceOp,
+                                                             args.nanOpt,
+                                                             args.indicesOpt,
+                                                             args.scales[0],
+                                                             args.scales[1]);
+    }
     else
     {
         if(args.compTypeId == appFloat)

diff --git a/script/cmake-rocm.sh b/script/cmake-rocm.sh
index fcfe6c960b..0e8424f940 100755
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,14 +3,14 @@
 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
 
-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 
 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D BUILD_DEV=OFF \
 -D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \

diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh
index a038f3f285..580a7ca1ee 100755
--- a/script/profile_reduce_no_index.sh
+++ b/script/profile_reduce_no_index.sh
@@ -3,13 +3,16 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16
 
-if test -n $PRECISION && test "$PRECISION" = "--half"; then
+if [ "$PRECISION" = "--half" ] || [ "$PRECISION" = "--bf16" ]; then
     ACCTYPE="-C 1"
-else
-    ACCTYPE=""
+elif [ "$PRECISION" = "--int8" ]; then
+    ACCTYPE="-C 2"
+else
+    ACCTYPE=""
 fi
 
 driver="./bin/ckProfiler"
 
 VERIFY="-v $1"
@@ -20,10 +23,16 @@
 NREPEAT=$3
 
 #### 0 - ADD, 5 - AVG, 7 - NORM2
 Operations="0 5 7"
 
+#### int8 supports only 0 - ADD and 5 - AVG (no NORM2)
+if [ "$PRECISION" = "--int8" ]; then
+    Operations="0 5"
+fi
+
 ## for generic validation
 for op in $Operations; do
 set -x
 #######              datatype  layout  reduce dims  op  acctype  verify  init  repeats
+$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
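The ACCTYPE variable above pins the accumulator data type that goes with each precision flag. A sketch of the convention, under the assumption that the `-C` codes follow the profiler's type enumeration (1: fp32, 2: int32); `acc_type_flag_for` is a hypothetical helper, not part of the repository:

```cpp
#include <string>

// Map a ckProfiler precision flag to the matching accumulator flag (sketch).
std::string acc_type_flag_for(const std::string& precision)
{
    if(precision == "--half" || precision == "--bf16")
        return "-C 1"; // fp16/bf16 inputs accumulate in fp32
    if(precision == "--int8")
        return "-C 2"; // int8 ADD/AVG accumulates in int32
    return "";         // fp32/fp64: the accumulator matches the input type
}
```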
diff --git a/script/profile_reduce_with_index.sh b/script/profile_reduce_with_index.sh
index 5e6a61748a..d4671e3981 100755
--- a/script/profile_reduce_with_index.sh
+++ b/script/profile_reduce_with_index.sh
@@ -3,6 +3,8 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16
 
 driver="./bin/ckProfiler"
@@ -18,6 +20,7 @@
 for op in $Operations; do
 for use_idx in 0 1; do
 set -x
 #######              datatype  layout  reduce dims  op  use index  verify  init  repeats
+$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT

diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh
new file mode 100755
index 0000000000..95e563c93c
--- /dev/null
+++ b/script/test_reduce_no_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float16 (data type 0)
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
+
+## for float (data type 1)
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2
+
+## for int8_t
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2
+
+## for bfloat16
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2
+
+set +x
+
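The two trailing positional values in each command above are read after getopt's options are consumed. A hedged sketch of that decoding; `parse_positionals` is a hypothetical helper mirroring the tests' own `data_type = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind]);`, with codes taken from the usage text added below (data type 0=fp16, 1=fp32, 3=int8, 5=bf16; init method 2 draws integers from [-5, 5)):

```cpp
#include <cstdlib>

struct PositionalArgs
{
    int data_type;   // e.g. 3 selects the int8_t path
    int init_method; // e.g. 2 selects GeneratorTensor_2{-5, 5}
};

PositionalArgs parse_positionals(char* argv[], int optind)
{
    return PositionalArgs{std::atoi(argv[optind]), std::atoi(argv[optind + 1])};
}
```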
diff --git a/script/test_reduce_with_index.sh b/script/test_reduce_with_index.sh
new file mode 100755
index 0000000000..8e7ed33847
--- /dev/null
+++ b/script/test_reduce_with_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float16 (data type 0)
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
+
+## for float (data type 1)
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
+
+## for int8_t
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
+
+## for bfloat16
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
+
+set +x
+

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4901c84813..13289443fa 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -40,3 +40,4 @@
 add_subdirectory(conv2d_fwd)
 add_subdirectory(convnd_fwd)
 add_subdirectory(conv2d_bwd_data)
 add_subdirectory(batched_gemm)
+add_subdirectory(reduce)

diff --git a/test/reduce/CMakeLists.txt b/test/reduce/CMakeLists.txt
new file mode 100644
index 0000000000..4e11b049a8
--- /dev/null
+++ b/test/reduce/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_test_executable(test_reduce_no_index reduce_no_index.cpp)
+add_test_executable(test_reduce_with_index reduce_with_index.cpp)
+target_link_libraries(test_reduce_no_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_no_index PRIVATE device_reduce_instance)
+target_link_libraries(test_reduce_with_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_with_index PRIVATE device_reduce_instance)
+

diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
new file mode 100644
index 0000000000..911bdf0bb1
--- /dev/null
+++ b/test/reduce/reduce_no_index.cpp
@@ -0,0 +1,666 @@
+#include "getopt.h"
+#include "device_reduce_instance.hpp"
+#include "reduction_enums.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_reduction.hpp"
+#include "test_util.hpp"
+#include "reduce_util.hpp"
+
+using namespace ck;
+
+namespace {
+
+template <int Rank, int NumReduceDim>
+static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+{
+    assert(NumReduceDim == reduceDims.size());
+
+    int reduceFlag = 0;
+
+    // flag the bits for the reduceDims
+    for(int i = 0; i < NumReduceDim; i++)
+    {
+        reduceFlag |= 1 << reduceDims[i];
+    };
+
+    std::vector<int> invariantDims;
+
+    // collect invariant dimensions
+    for(int i = 0; i < Rank; i++)
+        if((reduceFlag & (1 << i)) == 0)
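// Editor's note: reduceFlag is a bitmask with bit d set for every reduced
// dimension d; the test above keeps exactly the unmarked dimensions.
// Worked example for Rank = 4, reduceDims = {0, 2, 3}:
//     reduceFlag = 0b1101, and only bit 1 is clear,
//     so invariantDims becomes {1} and the output keeps just that one length.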
+ { + invariantDims.push_back(i); + }; + + return invariantDims; +}; + +// map the data type used by the GPU kernels to the corresponding type used by the host codes +template +struct type_mapping +{ + using OutType = InType; +}; + +template <> +struct type_mapping +{ + using OutType = half_float::half; +}; + +constexpr int Rank = 4; + +constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AVG; +constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES; +constexpr bool NeedIndices = false; + +template +bool test_reduce_no_index_impl(int init_method, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::host_reduce; + + constexpr bool out_support_atomic_add = std::is_same::value; + constexpr bool op_support_atomic_add = true; + constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); + + Tensor in(inLengths); + + std::vector outLengths; + + const auto invariantDims = get_invariant_dims(reduceDims); + + if(reduceDims.size() == Rank) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + + // only used when the OutDataType is bhalf_t + Tensor out_ref_fp32(outLengths); + Tensor out_fp32(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + using InElementwiseOperation_0 = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation_0 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_1 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_1 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_2 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_2 = + typename reduce_unary_operator:: + AccElementwiseOperation; + + using DeviceReduceInstPtr0 = + DeviceReducePtr; + using DeviceReduceInstPtr1 = + DeviceReducePtr; + using 
DeviceReduceInstPtr2 = + DeviceReducePtr; + + std::vector reduce0_ptrs; + std::vector reduce1_ptrs; + std::vector reduce2_ptrs; + + add_device_reduce_instance_threadwise(reduce0_ptrs); + + add_device_reduce_instance_blockwise(reduce0_ptrs); + + if constexpr(use_atomic_add) + { + add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); + } + else + { + add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + }; + + // used for secondary reduction + if constexpr(!use_atomic_add) + { + add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + }; + + if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + { + throw std::runtime_error("Wrong! No device REDUCE instance found"); + }; + + bool result = true; + + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; + + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + reinterpret_cast(in.mData.data()), + beta, + reinterpret_cast(out_ref.mData.data()), + nullptr); + + const auto i_inLengths = to_int_vector(inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + for(auto& reduce_ptr : reduce0_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + nullptr, + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + out_dev.FromDevice(out.mData.data()); + + bool single_result = true; + + if constexpr(std::is_same::value || + std::is_same::value) + { + reduce_util::to_f32_vector(out, out_fp32); + reduce_util::to_f32_vector(out_ref, out_ref_fp32); + single_result = test_util::check_err( + out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); + } + else + { + single_result = + test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + }; + + if(!single_result) + { + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + result = false; + } + }; + + for(auto& reduce_ptr : reduce1_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + nullptr, + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + 
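// Editor's note: instances in reduce1_ptrs only produce a partial reduction in
// the workspace buffer, so a second, blockwise pass (reduce2_ptrs) finishes the
// job. The workspace is then treated as a 2-D tensor whose lengths come from
// GetWorkspace2dLengths(); sketch of the flow:
//
//     pass 1: in[...]            -> ws_dev   (multiblock partial reduce)
//     pass 2: ws_dev[len0, len1] -> out_dev  (blockwise, strides {len1, 1})
//
// This matches the row-major inStrides2{inLengths2[1], 1} setup built just below.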
+        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
+        std::vector<int> inStrides2{inLengths2[1], 1};
+
+        for(auto& reduce2_ptr : reduce2_ptrs)
+        {
+            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation_2 acc_elementwise_op_2(
+                static_cast<int32_t>(reduce_total_length));
+
+            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
+                                                                  inStrides2,
+                                                                  i_outLengths,
+                                                                  i_outStrides,
+                                                                  reduceDims,
+                                                                  alpha,
+                                                                  beta,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  out_dev.GetDeviceBuffer(),
+                                                                  nullptr,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  in_elementwise_op_2,
+                                                                  acc_elementwise_op_2);
+
+            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
+                continue;
+
+            std::string reduce2_name = reduce2_ptr->GetTypeString();
+
+            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
+
+            (void)invoker2_ptr->Run(argument2_ptr.get());
+
+            out_dev.FromDevice(out.mData.data());
+
+            bool single_result = true;
+
+            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
+                         std::is_same<OutDataType, half_t>::value)
+            {
+                reduce_util::to_f32_vector(out, out_fp32);
+                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
+                single_result = test_util::check_err(
+                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
+            }
+            else
+            {
+                single_result =
+                    test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
+            };
+
+            if(!single_result)
+            {
+                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
+                          << reduce2_ptr->GetTypeString() << std::endl;
+                result = false;
+            }
+        };
+    };
+
+    return (result);
+};
+
+} // anonymous namespace
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"reduceDimensions", required_argument, nullptr, 'R'},
+                                       {"scales", required_argument, nullptr, 'S'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    template <typename T>
+    static T getSingleValueFromString(const std::string& valueStr)
+    {
+        std::istringstream iss(valueStr);
+
+        T ret;
+
+        iss >> ret;
+
+        return (ret);
+    };
+
+    template <typename T>
+    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
+    {
+        std::string valuesStr(cstr_values);
+
+        std::vector<T> values;
+        std::size_t pos = 0;
+        std::size_t new_pos;
+
+        new_pos = valuesStr.find(',', pos);
+        while(new_pos != std::string::npos)
+        {
+            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
+
+            T val = getSingleValueFromString<T>(sliceStr);
+
+            values.push_back(val);
+
+            pos     = new_pos + 1;
+            new_pos = valuesStr.find(',', pos);
+        };
+
+        std::string sliceStr = valuesStr.substr(pos);
+        T val                = getSingleValueFromString<T>(sliceStr);
+
+        values.push_back(val);
+
+        return (values);
+    };
+
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths;
+    std::vector<int> reduceDims;
+    std::vector<float> scales;
+
+    int data_type;
+    int init_method = 1;
+
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
+                     "(only 4-d tensor supported)"
+                  << std::endl;
+        std::cout << "--reduceDimensions or -R, comma separated list of dimension indexes to reduce "
+                     "(only 1 or 3 or 4 dimensions supported)"
+                  << std::endl;
+        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
+                  << std::endl;
+        std::cout << "Arg1 -- data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+    };
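// Editor's note: putting the usage text together, a typical CI invocation from
// script/test_reduce_no_index.sh decodes as:
//
//     bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
//                               |              |       | |
//                               |              |       | +-- init: integers in [-5, 5)
//                               |              |       +---- data type 3: int8_t
//                               |              +------------ reduce dims {0, 1, 3}
//                               +--------------------------- 4-d input lengths
//
// alpha/beta default to 1.0/0.0 unless -S overrides them.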
+    int processArgs(int argc, char* argv[])
+    {
+        int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'S':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                scales = getTypeValuesFromString<float>(optarg);
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type   = std::atoi(argv[optind++]);
+        init_method = std::atoi(argv[optind]);
+
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+
+        if(inLengths.size() != 4 ||
+           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
+            return (-1);
+
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+            return (-1);
+
+        return (0);
+    };
+};
+
+bool test_reduce_no_index(int data_type,
+                          int init_method,
+                          std::vector<int> reduceDims,
+                          std::vector<size_t> inLengths,
+                          float alpha,
+                          float beta)
+{
+    bool result = true;
+
+    if(data_type == 0)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 1)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 3)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 5)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+
+    return (result);
+};
+
+int main(int argc, char* argv[])
+{
+    SimpleAppArgs args;
+
+    bool result = true;
+
+    if(argc == 1)
+    {
+        int data_type   = 1;
+        int init_method = 2;
+        std::vector<size_t> inLengths{64, 4, 280, 80};
+        std::vector<std::vector<int>> v_reduceDims{
+            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
+
+        for(auto& reduceDims : v_reduceDims)
+            result = result && test_reduce_no_index(
+                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+    }
else + { + if(args.processArgs(argc, argv) < 0) + { + throw std::runtime_error( + "Invalid input arguments, test_reduce_no_index could not be executed!"); + }; + + result = test_reduce_no_index(args.data_type, + args.init_method, + args.reduceDims, + args.inLengths, + args.scales[0], + args.scales[1]); + } + + std::cout << "test_reduce_no_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl; + + return (result ? 0 : -1); +} diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp new file mode 100644 index 0000000000..e9a7b4896e --- /dev/null +++ b/test/reduce/reduce_util.hpp @@ -0,0 +1,19 @@ +#ifndef REDUCE_UTILS_HPP +#define REDUCE_UTILS_HPP + +#include "data_type.hpp" + +namespace ck { +namespace reduce_util { + +template +void to_f32_vector(const Tensor& src, Tensor& dst) +{ + for(int i = 0; i < src.mData.size(); ++i) + dst.mData[i] = type_convert(src.mData[i]); +} + +} // namespace reduce_util + +} // namespace ck +#endif diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp new file mode 100644 index 0000000000..4c51fad550 --- /dev/null +++ b/test/reduce/reduce_with_index.cpp @@ -0,0 +1,669 @@ +#include "getopt.h" +#include "device_reduce_instance.hpp" +#include "reduction_enums.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_reduction.hpp" +#include "test_util.hpp" +#include "reduce_util.hpp" + +using namespace ck; + +namespace { + +template +static inline std::vector get_invariant_dims(const std::vector& reduceDims) +{ + assert(NumReduceDim == reduceDims.size()); + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::vector invariantDims; + + // collect invariant dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims.push_back(i); + }; + + return invariantDims; +}; + +// map the data type used by the GPU kernels to the corresponding type used by the host codes +template +struct type_mapping +{ + using OutType = InType; +}; + +template <> +struct type_mapping +{ + using OutType = half_float::half; +}; + +constexpr int Rank = 4; + +constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AMAX; +constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::FLATTENED_INDICES; +constexpr bool NeedIndices = true; + +template +bool test_reduce_with_index_impl(int init_method, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::host_reduce; + + Tensor in(inLengths); + + std::vector outLengths; + + const auto invariantDims = get_invariant_dims(reduceDims); + + if(reduceDims.size() == Rank) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + // only used when the OutDataType is bhalf_t + Tensor out_ref_fp32(outLengths); + Tensor out_fp32(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / 
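// Editor's note: the two lengths below factor the element count of the input:
// invariant_total_length is the product of the kept dimensions (the output
// size) and reduce_total_length the product of the reduced ones. For
// -D 64,4,280,82 -R 1: invariant = 64 * 280 * 82 = 1,469,440 and reduce = 4.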
invariant_total_length; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0; + + DeviceMem out_indices_dev(indicesSizeInBytes); + + using InElementwiseOperation_0 = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation_0 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_1 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_1 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_2 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_2 = + typename reduce_unary_operator:: + AccElementwiseOperation; + + using DeviceReduceInstPtr0 = + DeviceReducePtr; + using DeviceReduceInstPtr1 = + DeviceReducePtr; + using DeviceReduceInstPtr2 = + DeviceReducePtr; + + std::vector reduce0_ptrs; + std::vector reduce1_ptrs; + std::vector reduce2_ptrs; + + add_device_reduce_instance_threadwise(reduce0_ptrs); + + add_device_reduce_instance_blockwise(reduce0_ptrs); + + add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + + add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + + if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + { + throw std::runtime_error("Wrong! 
No device REDUCE instance found"); + }; + + bool result = true; + + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; + + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + reinterpret_cast(in.mData.data()), + beta, + reinterpret_cast(out_ref.mData.data()), + out_indices_ref.mData.data()); + + const auto i_inLengths = to_int_vector(inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + for(auto& reduce_ptr : reduce0_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + out_dev.FromDevice(out.mData.data()); + + bool single_result = true; + + if constexpr(std::is_same::value || + std::is_same::value) + { + reduce_util::to_f32_vector(out, out_fp32); + reduce_util::to_f32_vector(out_ref, out_ref_fp32); + single_result = test_util::check_err( + out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); + } + else + { + single_result = + test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + }; + + if(NeedIndices) + { + out_indices_dev.FromDevice(out_indices.mData.data()); + single_result = single_result && test_util::check_err(out_indices_ref.mData, + out_indices.mData, + "Error: incorrect index result!"); + }; + + if(!single_result) + { + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + result = false; + } + }; + + for(auto& reduce_ptr : reduce1_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + std::string reduce_name = reduce_ptr->GetTypeString(); + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); + std::vector inStrides2{inLengths2[1], 1}; + + for(auto& reduce2_ptr : reduce2_ptrs) + { + InElementwiseOperation_2 in_elementwise_op_2(static_cast(reduce_total_length)); + AccElementwiseOperation_2 acc_elementwise_op_2( + 
+                static_cast<int32_t>(reduce_total_length));
+
+            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
+                                                                  inStrides2,
+                                                                  i_outLengths,
+                                                                  i_outStrides,
+                                                                  reduceDims,
+                                                                  alpha,
+                                                                  beta,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  out_dev.GetDeviceBuffer(),
+                                                                  out_indices_dev.GetDeviceBuffer(),
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  in_elementwise_op_2,
+                                                                  acc_elementwise_op_2);
+
+            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
+                continue;
+
+            std::string reduce2_name = reduce2_ptr->GetTypeString();
+
+            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
+
+            (void)invoker2_ptr->Run(argument2_ptr.get());
+
+            out_dev.FromDevice(out.mData.data());
+
+            bool single_result = true;
+
+            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
+                         std::is_same<OutDataType, half_t>::value)
+            {
+                reduce_util::to_f32_vector(out, out_fp32);
+                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
+                single_result = test_util::check_err(
+                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
+            }
+            else
+            {
+                single_result =
+                    test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
+            };
+
+            if(NeedIndices)
+            {
+                out_indices_dev.FromDevice(out_indices.mData.data());
+                single_result =
+                    single_result && test_util::check_err(out_indices_ref.mData,
+                                                          out_indices.mData,
+                                                          "Error: incorrect index result!");
+            };
+
+            if(!single_result)
+            {
+                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
+                          << reduce2_ptr->GetTypeString() << std::endl;
+                result = false;
+            }
+        };
+    };
+
+    return (result);
+};
+
+} // anonymous namespace
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"reduceDimensions", required_argument, nullptr, 'R'},
+                                       {"scales", required_argument, nullptr, 'S'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    template <typename T>
+    static T getSingleValueFromString(const std::string& valueStr)
+    {
+        std::istringstream iss(valueStr);
+
+        T ret;
+
+        iss >> ret;
+
+        return (ret);
+    };
+
+    template <typename T>
+    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
+    {
+        std::string valuesStr(cstr_values);
+
+        std::vector<T> values;
+        std::size_t pos = 0;
+        std::size_t new_pos;
+
+        new_pos = valuesStr.find(',', pos);
+        while(new_pos != std::string::npos)
+        {
+            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
+
+            T val = getSingleValueFromString<T>(sliceStr);
+
+            values.push_back(val);
+
+            pos     = new_pos + 1;
+            new_pos = valuesStr.find(',', pos);
+        };
+
+        std::string sliceStr = valuesStr.substr(pos);
+        T val                = getSingleValueFromString<T>(sliceStr);
+
+        values.push_back(val);
+
+        return (values);
+    };
+
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths;
+    std::vector<int> reduceDims;
+    std::vector<float> scales;
+
+    int data_type;
+    int init_method = 1;
+
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
+                     "(only 4-d tensor supported)"
+                  << std::endl;
+        std::cout << "--reduceDimensions or -R, comma separated list of dimension indexes to reduce "
+                     "(only 1 or 3 or 4 dimensions supported)"
+                  << std::endl;
+        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
+                  << std::endl;
+        std::cout << "Arg1 -- data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+    };
+
+    int processArgs(int argc, char* argv[])
+    {
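// Editor's note: the option handling below matches reduce_no_index.cpp:
// getopt_long consumes -D/-R/-S (all required_argument), and the two trailing
// positional values are then read from argv[optind]. A condensed equivalent:
//
//     int ch;
//     while((ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index)) != -1)
//     {
//         switch(ch) { case 'D': /* lengths */ break; /* 'R', 'S', '?' ... */ }
//     }
//     data_type   = std::atoi(argv[optind++]);
//     init_method = std::atoi(argv[optind]);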
+        int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'S':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                scales = getTypeValuesFromString<float>(optarg);
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type   = std::atoi(argv[optind++]);
+        init_method = std::atoi(argv[optind]);
+
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+
+        if(inLengths.size() != 4 ||
+           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
+            return (-1);
+
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+            return (-1);
+
+        return (0);
+    };
+};
+
+bool test_reduce_with_index(int data_type,
+                            int init_method,
+                            std::vector<int> reduceDims,
+                            std::vector<size_t> inLengths,
+                            float alpha,
+                            float beta)
+{
+    bool result = true;
+
+    if(data_type == 0)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 1)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 3)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 5)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+
+    return (result);
+};
+
+int main(int argc, char* argv[])
+{
+    SimpleAppArgs args;
+
+    bool result = true;
+
+    if(argc == 1)
+    {
+        int data_type   = 1;
+        int init_method = 2;
+        std::vector<size_t> inLengths{64, 4, 280, 80};
+        std::vector<std::vector<int>> v_reduceDims{
+            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
+
+        for(auto& reduceDims : v_reduceDims)
+            result = result && test_reduce_with_index(
+                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+    }
+    else
+    {
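// Editor's note: the argc == 1 branch above gives the CI a zero-configuration
// smoke test: fp32 (data_type 1), random-integer init, nine reduce-dim subsets
// of a 64x4x280x80 input. With arguments, exactly one configuration runs, which
// is how script/test_reduce_with_index.sh drives it; either way main() returns
// 0 on success and -1 on failure, so callers can rely on the exit status.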
if(args.processArgs(argc, argv) < 0) + { + throw std::runtime_error( + "Invalid input arguments, test_reduce_with_index could not be executed!"); + }; + + result = test_reduce_with_index(args.data_type, + args.init_method, + args.reduceDims, + args.inLengths, + args.scales[0], + args.scales[1]); + } + + std::cout << "test_reduce_with_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl; + + return (result ? 0 : -1); +}
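The with-index tests request flattened indices: each reduced output element also records where the winning value sat inside its reduced slice. A small, self-contained sketch of how a consumer might decode such an offset back into per-dimension coordinates, assuming the reduced dimensions are linearized row-major (an assumption about the layout, not something this patch states); `decode_flattened_index` is a hypothetical helper:

```cpp
#include <cstdint>
#include <vector>

// Decode a flattened index into coordinates over the reduced dimensions,
// assuming the last reduced dimension varies fastest (row-major).
std::vector<int64_t> decode_flattened_index(int64_t flat,
                                            const std::vector<int64_t>& reduceLengths)
{
    std::vector<int64_t> coord(reduceLengths.size());
    for(size_t i = reduceLengths.size(); i-- > 0;)
    {
        coord[i] = flat % reduceLengths[i];
        flat /= reduceLengths[i];
    }
    return coord;
}

// e.g. with reduceLengths {4, 280, 82} (the -R 1,2,3 case), flat == 23000
// decodes to {1, 0, 40}:  23000 = (1 * 280 + 0) * 82 + 40.
```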