diff --git a/Dockerfile b/Dockerfile index 52e4dfe4fd..6da9e587f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:18.04 -ARG ROCMVERSION=4.3.1 +ARG ROCMVERSION=5.0 ARG OSDB_BKC_VERSION RUN set -xe diff --git a/composable_kernel/include/tensor_operation/element_wise_operation.hpp b/composable_kernel/include/tensor_operation/element_wise_operation.hpp index 487104c3cf..2c45d1f544 100644 --- a/composable_kernel/include/tensor_operation/element_wise_operation.hpp +++ b/composable_kernel/include/tensor_operation/element_wise_operation.hpp @@ -175,6 +175,161 @@ struct RequantReluRequant float scaleRelu_; }; +// Unary operators are usually called element-wisely before/after the reduction is executed on the +// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2 + +template +struct UnaryIdentic; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const { y = x; }; +}; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const + { + y = x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }; +}; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const { y = x; }; +}; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const + { + y = x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +template <> +struct UnaryIdentic +{ + __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }; +}; + +template +struct UnarySquare; + +template <> +struct UnarySquare +{ + __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; }; +}; + +template <> +struct UnarySquare +{ + __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const + { + y = x * x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +template <> +struct UnarySquare +{ + __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; }; +}; + +template <> +struct UnarySquare +{ + __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const + { + y = x * x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +template +struct UnaryAbs; + +template <> +struct UnaryAbs +{ + __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); }; +}; + +template <> +struct UnaryAbs +{ + __host__ __device__ 
UnaryAbs(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); }; +}; + +template <> +struct UnaryAbs +{ + __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); }; +}; + +template +struct UnarySqrt; + +template <> +struct UnarySqrt +{ + __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); }; +}; + +template <> +struct UnarySqrt +{ + __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; }; + + __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); }; +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/composable_kernel/include/tensor_operation/gridwise_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_blockwise.hpp new file mode 100644 index 0000000000..a5202888f2 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_blockwise.hpp @@ -0,0 +1,925 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP +#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP + +#include "data_type.hpp" +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "reduction_functions_blockwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const OutElementwiseOperation acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + const IndexDataType* const __restrict__ p_ws_indices_global, + IndexDataType* const __restrict__ p_indices_global) +{ + if constexpr(!NeedIndices) + { + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_ws_indices_global, + p_indices_global); + } + else + { + GridwiseReduction::RunWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_ws_indices_global, + p_indices_global); + }; +}; + +template +__global__ void +kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const OutElementwiseOperation acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + const IndexDataType* const __restrict__ p_ws_indices_global, + IndexDataType* const __restrict__ p_indices_global) +{ + if constexpr(!NeedIndices) + { + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_ws_indices_global, + p_indices_global); + } + else + { + GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_ws_indices_global, + p_indices_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_blockwise +{ + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + static constexpr auto buffer_1d_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + template + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + + static constexpr auto I0 = Number<0>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const OutElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + const IndexDataType* const __restrict__ p_ws_indices_global, + IndexDataType* const __restrict__ p_indices_global) + { + using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer; + using Accumulation = + detail::AccumulateWithNanCheck; + + (void)p_ws_indices_global; + 
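+    // (both index pointers are deliberately ignored on this non-indexed Run()
+    // path: it neither consumes workspace indices nor produces output indices;
+    // the parameters exist only so the indexed and non-indexed kernel
+    // instantiations share one launch signature)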
(void)p_indices_global; + + // LDS + __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + auto out_global_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + + auto block_reduce_buf = + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + const index_t thread_m_cluster_id = + reorder_thread_cluster ? thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); + }); + + // reduce on each thread-local slice + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + }); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < toReduceTiles); + + constexpr auto reduced_data_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(reorder_thread_cluster) + { + block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) = + accu_value_buf[I]; + } + else + block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) = + accu_value_buf[I]; + + accu_value_buf(I) = zeroVal; + + __syncthreads(); + + BlockwiseReduce::Reduce( + block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if constexpr(!BetaIsZero) 
+ { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + false>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + }); + }; + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run( + reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); + } + }; + + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const OutElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + const IndexDataType* const __restrict__ p_ws_indices_global, + IndexDataType* const __restrict__ p_indices_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndexOn1dBuffer; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)p_ws_indices_global; + + // LDS + __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; + __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize]; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + auto out_global_val_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_indices_global, out_grid_desc_m.GetElementSpaceSize()); + + auto block_reduce_val_buf = + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + auto block_reduce_idx_buf = + make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer + accu_index_buf; + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + const index_t thread_m_cluster_id = + reorder_thread_cluster ? thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? 
((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + index_t indexOffset = 0; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; + + index_t reducedTiles = 0; + do + { + // load the thread slice + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + + // initialize the indices for the per-thread to-reduce values + in_thread_idx_buf(offset) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + J(); + + // do element-wise pre-reduction operation + in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset)); + }); + + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + + // reduce on the dim1 thread slice + AccumulationWithIndex::Calculate( + tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); + }); + + // store thread local value to LDS for parallel reduction + if constexpr(reorder_thread_cluster) + { + block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpIndex; + } + else + { + block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpIndex; + } + + __syncthreads(); + + BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf, + block_reduce_idx_buf, + tmpValue, + tmpIndex, + thread_m_cluster_id, + thread_k_cluster_id); + + AccumulationWithIndex::Calculate( + accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexOffset += K_BlockTileSize; + reducedTiles++; + } while(reducedTiles < toReduceTiles); + + constexpr auto reduced_data_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + // for indiced operation, acc_elementwise_op shoud do nothing + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if constexpr(!BetaIsZero) + { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + 
false>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + }); + }; + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + false>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + false>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + threadwise_dst_idx_store.Run(reduced_data_desc, + make_tuple(I0), + accu_index_buf, + out_grid_desc_m, + out_global_idx_buf); + } + }; + + __device__ static void + RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const OutElementwiseOperation acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_ws_values_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + const IndexDataType* const __restrict__ p_ws_indices_global, + IndexDataType* const __restrict__ p_indices_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndexOn1dBuffer; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)in_elementwise_op; + + // LDS + __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; + __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize]; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto src_global_val_buf = + make_dynamic_buffer(p_ws_values_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto src_global_idx_buf = make_dynamic_buffer( + p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_indices_global, out_grid_desc_m.GetElementSpaceSize()); + + auto block_reduce_val_buf = + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + auto block_reduce_idx_buf = + make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer + accu_index_buf; + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + const index_t thread_m_cluster_id = + reorder_thread_cluster ? thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? 
((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2< + IndexDataType, + IndexDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + // index_t indexOffset = 0; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; + + index_t reducedTiles = 0; + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + src_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + src_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + + // reduce on the dim1 thread slice + AccumulationWithIndex::Calculate( + tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); + }); + + // store thread local value to LDS for parallel reduction + if constexpr(reorder_thread_cluster) + { + block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpIndex; + } + else + { + block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpIndex; + } + + __syncthreads(); + + BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf, + block_reduce_idx_buf, + tmpValue, + tmpIndex, + thread_m_cluster_id, + thread_k_cluster_id); + + AccumulationWithIndex::Calculate( + accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + // indexOffset += K_BlockTileSize; + reducedTiles++; + } while(reducedTiles < toReduceTiles); + + constexpr auto reduced_data_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + // for indiced operation, acc_elementwise_op shoud do nothing + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + 
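+                // output contract: out = alpha * reduced_value + beta * prior_out;
+                // alpha is applied here, the beta term (if any) just below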
accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if constexpr(!BetaIsZero) + { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + }); + }; + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + threadwise_dst_idx_store.Run(reduced_data_desc, + make_tuple(I0), + accu_index_buf, + out_grid_desc_m, + out_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_atomic_add.hpp b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_atomic_add.hpp new file mode 100644 index 0000000000..23955e81a9 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_atomic_add.hpp @@ -0,0 +1,268 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "reduction_functions_blockwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void +kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType* const __restrict__ p_out_global) +{ + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + alpha, + p_in_global, + p_out_global); +}; + +template +struct GridwiseReduction_mk_to_m_multiblock_atomic_add +{ + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + static constexpr auto buffer_1d_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer; + + template + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + + static constexpr auto I0 = Number<0>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType* const __restrict__ p_out_global) + { + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + // LDS + __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + + const auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + auto out_global_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + + auto block_reduce_buf = + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + const index_t thread_m_cluster_id = + reorder_thread_cluster ? thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? 
((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); + }); + + // reduce on each thread-local slice + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + }); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + // Each block executes multiple parallel reductions on the LDS, then atomic-adds each + // reduced value to the global location of its invariant dimension, so that dimension's + // final result stays consistent. Due to the use of vector_load, each block/thread is + // involved in multiple invariant dimensions.
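As an aside, the atomic-add scheme is easier to see in a free-standing kernel. The sketch below shows the same idea in plain HIP, with hypothetical names (partial_row_sum, k_begin, k_end); it is an illustration of the technique, not the CK implementation. Each block reduces one K-range of one row through LDS and atomically accumulates its partial value into out[row], which must be zero-initialized; this single-pass merge only works for associative, commutative operations such as add.

#include <hip/hip_runtime.h>

// one block reduces in[row, k_begin..k_end) and atomic-adds the result to
// out[row]; assumes blockDim.x == 256 (a power of two) and out[] pre-zeroed
__global__ void partial_row_sum(
    const float* in, float* out, int rows, int cols, int k_begin, int k_end)
{
    const int row = blockIdx.y; // one invariant-dimension row per block row
    if(row >= rows)
        return;

    float acc = 0.0f;
    for(int k = k_begin + threadIdx.x; k < k_end; k += blockDim.x)
        acc += in[row * cols + k]; // thread-local partial sum over this K-range

    __shared__ float lds[256];
    lds[threadIdx.x] = acc;
    __syncthreads();

    for(int s = blockDim.x / 2; s > 0; s /= 2) // block-level tree reduction
    {
        if(threadIdx.x < s)
            lds[threadIdx.x] += lds[threadIdx.x + s];
        __syncthreads();
    }

    if(threadIdx.x == 0)
        atomicAdd(&out[row], lds[0]); // merge partials without a second kernel
}

The CK version above generalizes this with compile-time tile sizes and the ThreadwiseTensorSliceTransfer machinery for vectorized loads.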
+ static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(reorder_thread_cluster) + { + block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) = + accu_value_buf[I]; + } + else + block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) = + accu_value_buf[I]; + + accu_value_buf(I) = zeroVal; + + __syncthreads(); + + blockwise_reduce::Reduce( + block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::AtomicAdd, + 1, + true>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run( + reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_partial_reduce.hpp new file mode 100644 index 0000000000..85ccc2b995 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_multiblock_partial_reduce.hpp @@ -0,0 +1,514 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "reduction_functions_blockwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void +kernel_partial_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const WorkspaceDesc_M_K workspace_desc_m_k, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + const InDataType* const __restrict__ p_src_global, + AccDataType* const __restrict__ p_ws_values_global, + IndexDataType* const __restrict__ p_ws_indices_global) + +{ + if constexpr(!NeedIndices) + { + GridwiseReduction::Run(in_grid_desc_m_k, + workspace_desc_m_k, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + p_src_global, + p_ws_values_global, + p_ws_indices_global); + } + else + { + GridwiseReduction::RunWithIndex(in_grid_desc_m_k, + workspace_desc_m_k, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + p_src_global, + p_ws_values_global, + p_ws_indices_global); + }; +}; + +template +struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce +{ + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + static constexpr auto buffer1dDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + template + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + + static constexpr auto I0 = Number<0>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const WorkspaceDesc_M_K& workspace_desc_m_k, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + const InDataType* const __restrict__ p_src_global, + AccDataType* const __restrict__ p_ws_values_global, + IndexDataType* const __restrict__ p_ws_indices_global) + { + using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer; + + using Accumulation = + detail::AccumulateWithNanCheck; + + (void)p_ws_indices_global; + (void)acc_elementwise_op; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + // LDS + __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + + const auto in_global_buf = + make_dynamic_buffer(p_src_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto workspace_global_buf = make_dynamic_buffer( + p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); + + auto block_reduce_buf = + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + const index_t thread_m_cluster_id = + reorder_thread_cluster ? 
thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); + }); + + // reduce on each thread-local slice + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + }); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + // Each block executes multiple parallel reductions on the LDS, and due to the use of + // vector_load, each block/thread is involved in multiple invariant dimensions.
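Unlike the atomic-add variant, this file keeps each block's partial result in an M x G workspace (G = block_group_size) and leaves the final combine to a second kernel launch; that is also what makes indexed reductions (argmin/argmax) possible, since the partial values and their original K indices survive in the workspace. A small host-side sketch of the final pass for argmax, with hypothetical names (ws_val, ws_idx, argmax_final_pass), illustrates why both a value and an index workspace are carried; it is a sketch of the idea, not the CK code.

#include <utility>
#include <vector>

// combine one row's G partial (value, index) pairs; the winning index is the
// original K index recorded by pass one, not the workspace position g
std::pair<float, int> argmax_final_pass(
    const std::vector<float>& ws_val, const std::vector<int>& ws_idx, int row, int G)
{
    float best_val = ws_val[row * G];
    int best_idx   = ws_idx[row * G];
    for(int g = 1; g < G; ++g)
    {
        if(ws_val[row * G + g] > best_val)
        {
            best_val = ws_val[row * G + g];
            best_idx = ws_idx[row * G + g]; // propagate the original index
        }
    }
    return {best_val, best_idx};
}

In this patch that second pass is performed on the device by kernel_reduce_blockwise_second_call / RunSecondCallWithIndex above.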
+ static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(reorder_thread_cluster) + { + block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) = + accu_value_buf[I]; + } + else + block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) = + accu_value_buf[I]; + + accu_value_buf(I) = zeroVal; + + __syncthreads(); + + BlockwiseReduce::Reduce( + block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id); + }); + + if(thread_k_cluster_id == 0) + { + auto threadwise_workspace_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0, 1>, + 1, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + workspace_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + threadwise_workspace_store.Run(reduced_data_desc, + make_tuple(I0, I0), + accu_value_buf, + workspace_desc_m_k, + workspace_global_buf); + } + }; + + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const WorkspaceDesc_M_K& workspace_desc_m_k, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + const InDataType* const __restrict__ p_src_global, + AccDataType* const __restrict__ p_ws_values_global, + IndexDataType* const __restrict__ p_ws_indices_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndexOn1dBuffer; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)acc_elementwise_op; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + // LDS + __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; + __shared__ index_t p_block_reduce_idx_buffer[BlockSize]; + + const auto in_global_buf = + make_dynamic_buffer(p_src_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto workspace_global_val_buf = make_dynamic_buffer( + p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); + auto workspace_global_idx_buf = make_dynamic_buffer( + p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize()); + + auto block_reduce_val_buf = + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + auto block_reduce_idx_buf = + make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer + accu_index_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + const index_t thread_m_cluster_id = + reorder_thread_cluster ? thread_local_id % MThreadClusterSize + : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize); + const index_t thread_k_cluster_id = + reorder_thread_cluster ? 
((thread_local_id / MThreadClusterSize) % KThreadClusterSize) + : thread_local_id % KThreadClusterSize; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t indexOffset = block_local_id * reduceSizePerBlock; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + index_t reducedTiles = 0; + do + { + // load the thread slice + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + + // initialize the indices for the per-thread to-reduce values + in_thread_idx_buf(offset) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + J(); + + // do element-wise pre-reduction operation + in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset)); + }); + + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + + // reduce on the dim1 thread slice + AccumulationWithIndex::Calculate( + tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); + }); + + // store thread local value to LDS for parallel reduction + if constexpr(reorder_thread_cluster) + { + block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize + + thread_m_cluster_id) = tmpIndex; + } + else + { + block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpValue; + block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize + + thread_k_cluster_id) = tmpIndex; + } + + __syncthreads(); + + BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf, + block_reduce_idx_buf, + tmpValue, + tmpIndex, + thread_m_cluster_id, + thread_k_cluster_id); + + AccumulationWithIndex::Calculate( + accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexOffset += K_BlockTileSize; + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + if(thread_k_cluster_id == 0) + { + auto threadwise_workspace_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0, 1>, + 1, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + workspace_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto threadwise_workspace_idx_store = + 
ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0, 1>, + 1, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + workspace_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + threadwise_workspace_val_store.Run(reduced_data_desc, + make_tuple(I0, I0), + accu_value_buf, + workspace_desc_m_k, + workspace_global_val_buf); + threadwise_workspace_idx_store.Run(reduced_data_desc, + make_tuple(I0, I0), + accu_index_buf, + workspace_desc_m_k, + workspace_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_2d_reduction_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_threadwise.hpp new file mode 100644 index 0000000000..c5e92b3019 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_2d_reduction_threadwise.hpp @@ -0,0 +1,435 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP +#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP + +#include "data_type.hpp" +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + IndexDataType* const __restrict__ p_indices_global) +{ + if constexpr(!NeedIndices) + { + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_indices_global); + } + else + { + GridwiseReduction::RunWithIndices(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_global, + beta, + p_out_global, + p_indices_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_threadwise +{ + template + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + + static constexpr auto I0 = Number<0>{}; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + IndexDataType* const __restrict__ p_indices_global) + { + + using Accumulation = + detail::AccumulateWithNanCheck; + + (void)p_indices_global; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + auto dst_global_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t reducedLength = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); + 
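+                    // in_elementwise_op is the pre-reduction transform: identity
+                    // for plain reductions, UnaryAbs for NRM1, UnarySquare for
+                    // NRM2 (AVG instead divides via the acc-side UnaryIdentic
+                    // constructed with a divider)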
}); + + // reduce on each thread-local slice + static_for<0, KThreadSliceSize, 1>{}([&](auto J) { + constexpr auto offset = I * Number{} + J; + Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + }); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + }); + + constexpr auto reduced_data_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + if constexpr(!BetaIsZero) + { + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + + StaticBuffer + priorDstValue_buf; + + threadwise_dst_load.Run(out_grid_desc_m, + dst_global_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta); + }); + }; + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum_t::Set, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run( + reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); + }; + + __device__ static void RunWithIndices(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_global, + OutDataType beta, + OutDataType* const __restrict__ p_out_global, + IndexDataType* const __restrict__ p_indices_global) + { + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + (void)acc_elementwise_op; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + auto out_global_val_buf = make_dynamic_buffer( + p_out_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_indices_global, out_grid_desc_m.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + StaticBuffer + accu_index_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2< + InDataType, + AccDataType, + InGridDesc_M_K, + decltype(thread_buffer_desc), + ThreadBufferLengths, + typename conditional, Sequence<0, 1>>::type, + InSrcVectorDim, + InSrcVectorSize, + 1, + false>(in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t indexStart = 0; + index_t reducedLength = 0; + do + { + 
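+            // per iteration: load one MThreadSliceSize x KThreadSliceSize window,
+            // transform it element-wise, then fold values and their running K
+            // indices into the per-row accumulators before advancing the window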
threadwise_src_load.Run(in_grid_desc_m_k,
+                                    in_global_buf,
+                                    thread_buffer_desc,
+                                    make_tuple(I0, I0),
+                                    in_thread_buf);
+
+            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+                // do element-wise pre-reduction operation
+                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
+                    constexpr auto offset = I * Number{} + J;
+
+                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
+                });
+
+                // reduce on each thread-local slice
+                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
+                    constexpr auto offset = I * Number{} + J;
+                    AccumulationWithIndex::Calculate(accu_value_buf(I),
+                                                     in_thread_buf[offset],
+                                                     accu_index_buf(I),
+                                                     indexStart + J);
+                });
+            });
+
+            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
+
+            indexStart += KThreadSliceSize;
+            reducedLength += KThreadSliceSize;
+        } while(reducedLength < toReduceLength);
+
+        // for indexed operations, acc_elementwise_op should do nothing
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));
+
+            accu_value_buf(I) *= alpha;
+        });
+
+        constexpr auto reduced_data_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(Number{}));
+
+        if constexpr(!BetaIsZero)
+        {
+            if(!float_equal_zero{}(beta))
+            {
+                auto threadwise_dst_load =
+                    ThreadwiseTensorSliceTransfer_v2,
+                                                   Sequence<0>,
+                                                   0,
+                                                   1,
+                                                   1,
+                                                   false>(
+                        out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));
+
+                StaticBuffer
+                    priorDstValue_buf;
+
+                threadwise_dst_load.Run(out_grid_desc_m,
+                                        out_global_val_buf,
+                                        reduced_data_desc,
+                                        make_tuple(I0),
+                                        priorDstValue_buf);
+
+                static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+                    accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta);
+                });
+            };
+        };
+
+        auto threadwise_dst_val_store =
+            ThreadwiseTensorSliceTransfer_v1r3,
+                                               Sequence,
+                                               Sequence<0>,
+                                               0,
+                                               OutDstVectorSize,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(
+                out_grid_desc_m,
+                make_multi_index(thread_global_1d_id * MThreadSliceSize),
+                PassThroughOp{});
+
+        auto threadwise_dst_idx_store =
+            ThreadwiseTensorSliceTransfer_v1r3,
+                                               Sequence,
+                                               Sequence<0>,
+                                               0,
+                                               OutDstVectorSize,
+                                               InMemoryDataOperationEnum_t::Set,
+                                               1,
+                                               false>(
+                out_grid_desc_m,
+                make_multi_index(thread_global_1d_id * MThreadSliceSize),
+                PassThroughOp{});
+
+        threadwise_dst_val_store.Run(
+            reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf);
+
+        threadwise_dst_idx_store.Run(
+            reduced_data_desc, make_tuple(I0), accu_index_buf, out_grid_desc_m, out_global_idx_buf);
+    };
+};
+
+} // namespace ck
+#endif
diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
deleted file mode 100644
index 9ee63312a3..0000000000
--- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
+++ /dev/null
@@ -1,623 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_blockwise.hpp" - -#include "blockwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_blockwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - using blockwise_reduce = - BlockwiseReduction_2d_block_buffer; - - static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize(); - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); - - const 
index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_buf); - - __syncthreads(); - - // do element-wise pre-reduction operation - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - __shared__ int block_indices_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, 
dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - auto in_block_idx_buf = - make_dynamic_buffer(block_indices_buffer, BlockBufferSize); - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - int indexOffset = 0; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - // load block data from global to LDS, no use of double buffers (to be improved) - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); - - __syncthreads(); - - // construct the indices for the current toReduce blocks - blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? 
GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - indexOffset += BlockBufferSize; - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - __shared__ int block_indices_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = make_dynamic_buffer( - ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - auto in_block_idx_buf = - make_dynamic_buffer(block_indices_buffer, BlockBufferSize); - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_val_load = - 
BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - auto blockwise_src_idx_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - int, - int, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - // load block data from global to LDS, no use of double buffers (to be improved) - blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf); - blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf); - blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf); - blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf); - - __syncthreads(); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp deleted file mode 100644 index 
1ac24b7eac..0000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ /dev/null @@ -1,501 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_threadwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_direct_threadwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using threadwise_reduce = ThreadReduce; - - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); 
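The pre/post pair constructed above is how these removed kernels realized AVG/NRM1/NRM2 on top of a plain binary reduce: preUnaryOp transforms every element before accumulation, and posUnaryOp finishes the accumulated value. A rough host-side illustration of the NRM2 pairing, assuming the UnarySquare/UnarySqrt functors from element_wise_operation.hpp keep roughly the signatures shown there (the template arguments are an assumption; the divider is unused for NRM2):

    // NRM2(x) = sqrt(sum(x_i * x_i)): pre = square, post = sqrt
    ck::tensor_operation::element_wise::UnarySquare<float, float, false> pre;
    ck::tensor_operation::element_wise::UnarySqrt<float, float> post;
    float acc = 0.0f; // ADD is the binary op underneath NRM2
    for(float x : {1.0f, 2.0f, 2.0f})
    {
        float t = 0.0f;
        pre(t, x); // t = x * x
        acc += t;
    }
    post(acc, acc); // acc = sqrt(9.0f) == 3.0f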
- - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // do element-wise pre-reduction operation - threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using threadwise_reduce = ThreadReduce; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( 
- src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - index_t indexStart = 0; - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce2( - in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart); - - indexStart += GredThreadBufferLength; - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = make_dynamic_buffer( - ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_val_buf; - StaticBuffer in_thread_idx_buf; - - using threadwise_reduce = ThreadReduceWithIndicesInput; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t 
thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_val_load.Run(src2dDesc, - src_global_val_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(src2dDesc, - src_global_idx_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_idx_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce( - in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); - - threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp deleted file mode 100644 index 402d4e0d02..0000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp +++ /dev/null @@ -1,542 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_warpwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_direct_warpwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using warpwise_reduce = - WarpReduce; - - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id = 
thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // do element-wise pre-reduction operation - warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf(I0) * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using warpwise_reduce = - WarpReduce; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id 
= thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - index_t indexOffset = 0; - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce2( - in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset); - - indexOffset += warpSize * GredAccessesPerThreadInWarp; - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = make_dynamic_buffer( - ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_val_buf; - StaticBuffer - 
in_thread_idx_buf; - - using warpwise_reduce = WarpReduceWithIndicesInput; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id = thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_val_load.Run(src2dDesc, - src_global_val_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(src2dDesc, - src_global_idx_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_idx_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce( - in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); - - threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp deleted 
file mode 100644 index dda2efa884..0000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp +++ /dev/null @@ -1,376 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_blockwise.hpp" - -#include "blockwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_multiblock -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = typename reduce_unary_operator::posUnaryOp; - - static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - using blockwise_reduce = - BlockwiseReduction_2d_block_buffer; - - static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize(); - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global) - { - (void)ws_indices_global; - - (void)alpha; // unused - (void)beta; // unused - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto workspace_global_buf = make_dynamic_buffer( - ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - - auto in_block_buf = - 
make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / BlkGroupSize; - const index_t block_local_id = block_global_id % BlkGroupSize; - - const index_t reduceSizePerBlock = - (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / - BlockBufferSize) * - BlockBufferSize; - - constexpr auto in_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>( - src2dDesc, - make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_buf); - __syncthreads(); - - // do element-wise pre-reduction operation - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? 
GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - const auto workspace_desc = - make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - auto threadwise_workspace_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - threadwise_workspace_store.Run(ReducedDataDesc, - make_tuple(I0), - accuValue_buf, - workspace_desc, - workspace_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global) - { - (void)alpha; // unused - (void)beta; // unused - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - // LDS - __shared__ compType p_in_block_values_buffer[BlockBufferSize]; - __shared__ int p_in_block_indices_buffer[BlockBufferSize]; - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert(zeroVal)); - auto workspace_global_val_buf = make_dynamic_buffer( - ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - auto workspace_global_idx_buf = make_dynamic_buffer( - ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_values_buffer, BlockBufferSize); - auto in_block_idx_buf = make_dynamic_buffer( - p_in_block_indices_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / BlkGroupSize; - const index_t block_local_id = block_global_id % BlkGroupSize; - - const index_t reduceSizePerBlock = - (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / - BlockBufferSize) * - BlockBufferSize; - - constexpr auto in_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>( - src2dDesc, - make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; - - 
int indexOffset = block_local_id * reduceSizePerBlock; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); - - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); - - __syncthreads(); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - indexOffset += BlockBufferSize; - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - const auto workspace_desc = - make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - auto threadwise_workspace_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - auto threadwise_workspace_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - threadwise_workspace_val_store.Run(ReducedDataDesc, - make_tuple(I0), - accuValue_buf, - workspace_desc, - workspace_global_val_buf); - threadwise_workspace_idx_store.Run(ReducedDataDesc, - make_tuple(I0), - accuIndex_buf, - workspace_desc, - workspace_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_set_buffer_value.hpp b/composable_kernel/include/tensor_operation/gridwise_set_buffer_value.hpp new file mode 100644 index 0000000000..5293049024 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_set_buffer_value.hpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_GRIDWISE_SET_BUFFER_VALUE_HPP +#define CK_GRIDWISE_SET_BUFFER_VALUE_HPP + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template <index_t BlockSize, typename DataType, typename Grid1dBufferDescType> +__global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffer_desc, + DataType* const __restrict__ p_global, + DataType value) + +{ + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>; + + constexpr auto I0 = Number<0>{}; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const index_t thread_global_id = block_global_id * BlockSize + thread_local_id; + + StaticBuffer<AddressSpaceEnum_t::Vgpr, DataType, 1, true> value_buf; + + value_buf(I0) = value; + + constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + auto global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( + p_global, grid_1d_buffer_desc.GetElementSpaceSize()); + + if(thread_global_id < grid_1d_buffer_desc.GetElementSize()) + { + auto threadwise_store = ThreadwiseTensorSliceTransfer_v1r3<DataType, + DataType, + decltype(val_buff_desc), + Grid1dBufferDescType, + PassThroughOp, + Sequence<1>, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>( + grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{}); + + threadwise_store.Run( + val_buff_desc, make_tuple(I0), value_buf, grid_1d_buffer_desc, global_buf); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp index ff21118d24..5bb85b9685 100644 --- a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp @@ -30,240 +30,154 @@ #include "reduction_common.hpp" #include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" +#include "reduction_functions_accumulate.hpp" namespace ck { -template -struct BlockwiseReduction_2d_block_buffer +template +struct PartitionedBlockwiseReductionOn1dBuffer { - using compType = typename opReduce::dataType; + static constexpr auto buffer_1d_desc = Buffer1dDescType{}; - static constexpr auto buffer2dDesc = buffer2dDescType{}; + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "The product of cluster lengths should be the same as BlockSize!"); + static_assert(KThreadClusterSize > 1, "Parallel reduction needs to work on at least two elements"); - static constexpr index_t BlockSize = - blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{}); - static constexpr index_t NumBlocks = - blockIsOneRow ?
buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{}); - using binop = detail::binop_with_nan_check; + static_assert(buffer_1d_desc.GetElementSize() == BlockSize, + "The buffer size should be the same as BlockSize!"); + + using Accumulation = detail::AccumulateWithNanCheck; - // This interface does not accumulate on indices template - __device__ static void - Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData) + __device__ static void Reduce(BufferType& block_buffer, + AccDataType& accuData, + index_t thread_m_cluster_id, + index_t thread_k_cluster_id) { - const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetReductionZeroVal(); + constexpr auto cluster_len_shift = get_shift(); - index_t offset; - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd)); - compType opData = type_convert(block_buffer[offset]); + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); - binop::calculate(lAccuData, opData); - } - - offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - - block_buffer(offset) = lAccuData; - - __syncthreads(); - - for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2) - { - if(thread_local_id < indOffset) + if(thread_k_cluster_id < indOffset) { + // take the thread-cluster order into account, so that contiguous locations are + // accessed by threads with contiguous thread IDs index_t offset1 = - blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); + ReorderThreadClusters + ? buffer_1d_desc.CalculateOffset(make_tuple( + thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset(make_tuple( + thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id)); + index_t offset2 = ReorderThreadClusters + ? buffer_1d_desc.CalculateOffset(make_tuple( + (thread_k_cluster_id + indOffset) * MThreadClusterSize + + thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset( + make_tuple(thread_m_cluster_id * KThreadClusterSize + + (thread_k_cluster_id + indOffset))); - index_t offset2 = - blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0)); - - compType opData1 = type_convert(block_buffer[offset1]); - compType opData2 = type_convert(block_buffer[offset2]); - binop::calculate(opData1, opData2); - block_buffer(offset1) = type_convert(opData1); + AccDataType opData1 = type_convert(block_buffer[offset1]); + AccDataType opData2 = type_convert(block_buffer[offset2]); + Accumulation::Calculate(opData1, opData2); + block_buffer(offset1) = type_convert(opData1); } __syncthreads(); - } + }); - if(thread_local_id == 0) - { - compType tmpVal = type_convert(block_buffer[0]); + index_t offset = ReorderThreadClusters + ?
buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset( + make_tuple(thread_m_cluster_id * KThreadClusterSize)); - binop::calculate(accuData, tmpVal); - } + accuData = type_convert(block_buffer[offset]); }; +}; + +template +struct PartitionedBlockwiseReductionWithIndexOn1dBuffer +{ + static constexpr auto buffer_1d_desc = Buffer1dDescType{}; + + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "The product of cluster lengths should be the same as BlockSize!"); + static_assert(KThreadClusterSize > 1, "Parallel reduction needs to work on at least two elements"); + + static_assert(buffer_1d_desc.GetElementSize() == BlockSize, + "The buffer size should be the same as BlockSize!"); + + using Accumulation = + detail::AccumulateWithIndexAndNanCheck; // This interface accumulates on both data values and indices template - __device__ static void Reduce2(BufferType& block_buffer, - IdxBufferType& block_indices_buffer, - index_t toReduceBlocks, - compType& accuData, - int& accuIndex) + __device__ static void Reduce(BufferType& block_val_buffer, + IdxBufferType& block_idx_buffer, + AccDataType& accuData, + IndexDataType& accuIndex, + index_t thread_m_cluster_id, + index_t thread_k_cluster_id) { - const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; + constexpr auto cluster_len_shift = get_shift(); - if constexpr(blockIsOneRow) - { - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << I(); + + if(thread_k_cluster_id % (indOffset * 2) == 0) { - for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2) - { - if(thread_local_id % (indOffset * 2) == 0) - { - index_t offset1 = - buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id)); - index_t offset2 = buffer2dDesc.CalculateOffset( - make_tuple(otherDimInd, thread_local_id + indOffset)); + // take the thread-cluster order into account, so that contiguous locations are + // accessed by threads with contiguous thread IDs + index_t offset1 = + ReorderThreadClusters + ? buffer_1d_desc.CalculateOffset(make_tuple( + thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset(make_tuple( + thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id)); + index_t offset2 = ReorderThreadClusters + ?
buffer_1d_desc.CalculateOffset(make_tuple( + (thread_k_cluster_id + indOffset) * MThreadClusterSize + + thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset( + make_tuple(thread_m_cluster_id * KThreadClusterSize + + (thread_k_cluster_id + indOffset))); - compType currVal1 = type_convert(block_buffer[offset1]); - compType currVal2 = type_convert(block_buffer[offset2]); - int currIndex1 = block_indices_buffer[offset1]; - int currIndex2 = block_indices_buffer[offset2]; + AccDataType opData1 = type_convert(block_val_buffer[offset1]); + AccDataType opData2 = type_convert(block_val_buffer[offset2]); + IndexDataType currIndex1 = block_idx_buffer[offset1]; + IndexDataType currIndex2 = block_idx_buffer[offset2]; - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - block_buffer(offset1) = type_convert(currVal1); - block_indices_buffer(offset1) = currIndex1; - } - __syncthreads(); - } + Accumulation::Calculate(opData1, opData2, currIndex1, currIndex2); + block_val_buffer(offset1) = type_convert(opData1); + block_idx_buffer(offset1) = currIndex1; } - if(thread_local_id == 0) - { - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0)); - - compType tmpVal = type_convert(block_buffer[offset]); - int tmpIndex = block_indices_buffer[offset]; - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - } - - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - } - } - else - { - index_t offset; - - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd)); - compType currVal = type_convert(block_buffer[offset]); - int currIndex = block_indices_buffer[offset]; - - binop::calculate(lAccuData, currVal, lAccuIndex, currIndex); - } - - offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - - block_buffer(offset) = lAccuData; - block_indices_buffer(offset) = lAccuIndex; - __syncthreads(); + }); - for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2) - { - if(thread_local_id % (indOffset * 2) == 0) - { - index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - index_t offset2 = - buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0)); + index_t offset = ReorderThreadClusters + ? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id)) + : buffer_1d_desc.CalculateOffset( + make_tuple(thread_m_cluster_id * KThreadClusterSize)); - compType currVal1 = type_convert(block_buffer[offset1]); - compType currVal2 = type_convert(block_buffer[offset2]); - int currIndex1 = block_indices_buffer[offset1]; - int currIndex2 = block_indices_buffer[offset2]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - block_buffer(offset1) = type_convert(currVal1); - block_indices_buffer(offset1) = currIndex1; - } - - __syncthreads(); - } - - if(thread_local_id == 0) - { - compType tmpVal = type_convert(block_buffer[0]); - int tmpIndex = block_indices_buffer[0]; - - binop::calculate(accuData, tmpVal, accuIndex, tmpIndex); - } - } - }; - - template - __device__ static void set_buffer_value(BufferType& block_buffer, compType value) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? 
buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_buffer(offset) = value; - - __syncthreads(); - } - }; - - // Initialize the block-wise indices buffer, the index for each element in the block-wise - // data buffer is calculated according to its position in the buffer and the global starting - // index - template - __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_indices_buffer(offset) = offset + indexStart; - - __syncthreads(); - } - }; - - // Execute unary operation on the block buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_buffer(offset) = unary_op(block_buffer[offset]); - - __syncthreads(); - } - }; + accuData = type_convert(block_val_buffer[offset]); + accuIndex = block_idx_buffer[offset]; + } }; }; // end of namespace ck diff --git a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp deleted file mode 100644 index 2956606a6b..0000000000 --- a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp +++ /dev/null @@ -1,141 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP -#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" - -namespace ck { - -template -struct ThreadReduce -{ - using compType = typename opReduce::dataType; - - static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!"); - - static_assert( - std::is_same::value, - "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - - using binop = detail::binop_with_nan_check; - - // This interface does not accumulate on indices - __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(accuData, thread_buffer[I]); }); - }; - - // This interface accumulates on both data values and indices and - // is called by Direct_ThreadWise reduction method at first-time reduction - __device__ static void - Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = I + indexStart; - binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex); - }); - }; - - // Set the elements in the per-thread buffer to a specific value - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - }; -}; - -template -struct ThreadReduceWithIndicesInput -{ - using compType = typename opReduce::dataType; - - static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!"); - static_assert(IdxBufferType::IsStaticBuffer(), - "Thread-wise reduction needs use StaticBuffer for indices!"); - - static_assert( - std::is_same::value, - "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!"); - static_assert(std::is_same::value, - "Indices type of StaticBuffer for Thread-wise reduction should be index_t!"); - - static_assert(BufferType::Size() == IdxBufferType::Size(), - "StaticBuffers for data and indices should have the same sizes!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - - using binop = detail::binop_with_nan_check; - - // This interface accumulates on both data values and indices and - // is called by Direct_ThreadWise reduction method at second-time reduction - __device__ static void Reduce(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]); - }); - }; - - // Set the elements in the per-thread buffer to a specific value - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { 
thread_buffer(I) = value; }); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp deleted file mode 100644 index 9687d2d8c8..0000000000 --- a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp +++ /dev/null @@ -1,371 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP -#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" - -namespace ck { - -template -struct WarpReduce -{ - using compType = typename opReduce::dataType; - using binop = detail::binop_with_nan_check; - - static_assert(BufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); - static_assert(std::is_same::value, - "Data type of per-thread StaticBuffer for WarpWise reduction should be same as " - "the compType!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - static constexpr bool have_builtin_shuffle = - std::is_same::value || std::is_same::value; - - // This interface does not accumulate on indices - __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData) - { - if constexpr(have_builtin_shuffle) - ReduceImpl1(thread_buffer, accuData); - else - ReduceImpl2(thread_buffer, accuData); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = warpSize / 2; stride > 0; stride /= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - binop::calculate(lAccuData, tmpVal); - __all(1); - } - - binop::calculate(accuData, lAccuData); - }; - - // This interface implementation does not use HIP built-in device shuffling functions - // since for fp16, built-in shuffling functions is not provided by HIP - __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); - - __syncthreads(); - - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - __shared__ compType shuffle_buffer[BlockSize]; - - compType* myBuffer = &shuffle_buffer[warpId * warpSize]; - - myBuffer[thread_inwarp_id] = lAccuData; - - __syncthreads(); - - for(index_t stride = warpSize / 2; stride > 0; stride /= 2) - { - if(thread_inwarp_id < stride) - { - compType currVal1 = myBuffer[thread_inwarp_id]; - compType currVal2 = myBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2); - - myBuffer[thread_inwarp_id] = currVal1; - } - - __syncthreads(); - } - if(thread_inwarp_id == 0) - binop::calculate(accuData, myBuffer[0]); - }; - - // This interface accumulates on both data values and indices and is called by Direct_WarpWise - // reduction method at first-time reduction - __device__ static void - Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart) - { - if constexpr(have_builtin_shuffle) - Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart); - else - Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void Reduce2Impl1(const BufferType& thread_buffer, - 
compType& accuData, - int& accuIndex, - int indexStart) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); - }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - __all(1); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - }; - - // This interface implementation does not use HIP built-in device shuffling functions since for - // fp16, built-in shuffling functions is not provided by HIP - __device__ static void Reduce2Impl2(const BufferType& thread_buffer, - compType& accuData, - int& accuIndex, - int indexStart) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); - }); - - __shared__ compType shuffle_data_buffer[BlockSize]; - __shared__ int shuffle_indices_buffer[BlockSize]; - - compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; - int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; - - myDataBuffer[thread_inwarp_id] = lAccuData; - myIndicesBuffer[thread_inwarp_id] = lAccuIndex; - - __syncthreads(); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType currVal1 = myDataBuffer[thread_inwarp_id]; - compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; - int currIndex1 = myIndicesBuffer[thread_inwarp_id]; - int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - - myDataBuffer[thread_inwarp_id] = currVal1; - myIndicesBuffer[thread_inwarp_id] = currIndex1; - - __syncthreads(); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); - }; - - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - - __all(1); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - - __all(1); - }; -}; - -template -struct WarpReduceWithIndicesInput -{ - using compType = typename opReduce::dataType; - using binop = detail::binop_with_nan_check; - - static_assert(BufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); - static_assert(IdxBufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!"); - - static_assert(std::is_same::value, - "Data type of per-thread StaticBuffer for WarpWise reduction should 
be same as " - "the compType!"); - static_assert( - std::is_same::value, - "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!"); - - static_assert(BufferType::Size() == IdxBufferType::Size(), - "StaticBuffers for data and indices should have the same sizes!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - static constexpr bool have_builtin_shuffle = - std::is_same::value || std::is_same::value; - - // This interface accumulates on both data values and indices and is called by Direct_WarpWise - // reduction method at second-time reduction - __device__ static void Reduce(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - if constexpr(have_builtin_shuffle) - ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex); - else - ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void ReduceImpl1(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); - }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - __all(1); - } - - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - }; - - // This interface implementation does not use HIP built-in device shuffling functions - // since for fp16, built-in shuffling functions is not provided by HIP - __device__ static void ReduceImpl2(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); - }); - - __shared__ compType shuffle_data_buffer[BlockSize]; - __shared__ int shuffle_indices_buffer[BlockSize]; - - compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; - int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; - - myDataBuffer[thread_inwarp_id] = lAccuData; - myIndicesBuffer[thread_inwarp_id] = lAccuIndex; - - __syncthreads(); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType currVal1 = myDataBuffer[thread_inwarp_id]; - compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; - int currIndex1 = myIndicesBuffer[thread_inwarp_id]; - int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - - myDataBuffer[thread_inwarp_id] = currVal1; - myIndicesBuffer[thread_inwarp_id] = currIndex1; - - __syncthreads(); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); - }; - - // cppcheck-suppress constParameter - __device__ static void 
set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - - __all(1); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - - __all(1); - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/utility/math_v2.hpp b/composable_kernel/include/utility/math_v2.hpp new file mode 100644 index 0000000000..25604149d4 --- /dev/null +++ b/composable_kernel/include/utility/math_v2.hpp @@ -0,0 +1,16 @@ +#ifndef CK_MATH_V2_HPP +#define CK_MATH_V2_HPP + +#include "data_type.hpp" + +namespace ck { +namespace math { + +static inline __device__ half_t abs(half_t x) { return __habs(x); }; +static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); }; +static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; + +} // namespace math +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/reduction_common.hpp b/composable_kernel/include/utility/reduction_common.hpp index ff574c315c..0cf6d31ed6 100644 --- a/composable_kernel/include/utility/reduction_common.hpp +++ b/composable_kernel/include/utility/reduction_common.hpp @@ -48,6 +48,18 @@ struct float_equal_zero }; }; +template +static constexpr __device__ index_t get_shift() +{ + return (get_shift() + 1); +}; + +template <> +constexpr __device__ index_t get_shift<1>() +{ + return (0); +} + }; // end of namespace ck #endif diff --git a/composable_kernel/include/utility/reduction_functions_binop.hpp b/composable_kernel/include/utility/reduction_functions_accumulate.hpp similarity index 51% rename from composable_kernel/include/utility/reduction_functions_binop.hpp rename to composable_kernel/include/utility/reduction_functions_accumulate.hpp index 5285abee81..4e8636e5b2 100644 --- a/composable_kernel/include/utility/reduction_functions_binop.hpp +++ b/composable_kernel/include/utility/reduction_functions_accumulate.hpp @@ -34,50 +34,79 @@ namespace ck { namespace detail { -static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; +template +static inline __device__ bool is_nan(T x) +{ + return (isnan(x)); +}; -template -struct binop_with_nan_check; +template <> +inline __device__ bool is_nan(half_t x) +{ + return (__hisnan(x)); +}; -template -struct binop_with_nan_check +template +struct AccumulateWithNanCheck; + +template +struct AccumulateWithNanCheck { // cppcheck-suppress constParameter - __device__ static inline void calculate(compType& accuVal, compType currVal) + __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal) { - opReduce{}(accuVal, currVal); + ReduceOperation{}(accuVal, currVal); }; +}; - // The method is called when the opReduce is indexable and the user asked for indices +template +struct AccumulateWithNanCheck +{ + __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal) + { + if(is_nan(currVal)) + { + accuVal = currVal; + } + else + { + ReduceOperation{}(accuVal, currVal); + }; + }; +}; + +template +struct AccumulateWithIndexAndNanCheck; + +template +struct AccumulateWithIndexAndNanCheck +{ __device__ static inline void // cppcheck-suppress constParameter - calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex) + Calculate(AccDataType& accuVal, + AccDataType 
currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) { bool changed = false; - opReduce{}(accuVal, currVal, changed); + ReduceOperation{}(accuVal, currVal, changed); if(changed) accuIndex = currIndex; }; }; -template -struct binop_with_nan_check +template +struct AccumulateWithIndexAndNanCheck { - __device__ static inline void calculate(compType& accuVal, compType currVal) + // The method is called when the ReduceOperation is indexable and the user asked for indices + __device__ static inline void Calculate(AccDataType& accuVal, + AccDataType currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce{}(accuVal, currVal); - }; - - // The method is called when the opReduce is indexable and the user asked for indices - __device__ static inline void - calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex) - { - if(isnan(currVal)) + if(is_nan(currVal)) { accuVal = currVal; accuIndex = currIndex; @@ -86,7 +115,7 @@ struct binop_with_nan_check { bool changed = false; - opReduce{}(accuVal, currVal, changed); + ReduceOperation{}(accuVal, currVal, changed); if(changed) accuIndex = currIndex; diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index 15538b9920..5893f60547 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -26,7 +26,7 @@ #ifndef CK_REDUCTION_OPERATOR_HPP #define CK_REDUCTION_OPERATOR_HPP -#include "reduction_common.hpp" +#include "common_header.hpp" namespace ck { @@ -60,11 +60,9 @@ struct Add { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; - __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } - - static constexpr bool indexable = false; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } }; template @@ -72,11 +70,9 @@ struct Mul { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; + __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; - __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } - - static constexpr bool indexable = false; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } }; template @@ -84,15 +80,18 @@ struct Max { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Lowest(); }; + __host__ __device__ static constexpr T GetReductionZeroVal() + { + return NumericLimits::Lowest(); + }; - __device__ inline constexpr void operator()(T& a, T b) const + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) a = b; } - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const { if(a < b) { @@ -100,8 +99,6 @@ struct Max changed = true; } } - - static constexpr bool indexable = true; }; template @@ -109,15 +106,18 @@ struct Min { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Max(); }; + __host__ __device__ static constexpr T GetReductionZeroVal() + { + return NumericLimits::Max(); + }; - __device__ inline constexpr void operator()(T& a, T b) 
const + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a > b) a = b; } - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const { if(a > b) { @@ -125,8 +125,6 @@ struct Min changed = true; } } - - static constexpr bool indexable = true; }; template @@ -134,15 +132,15 @@ struct AMax { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; - __device__ inline constexpr void operator()(T& a, T b) const + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) a = b; } - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const { if(a < b) { @@ -150,270 +148,10 @@ struct AMax changed = true; } } - - static constexpr bool indexable = true; -}; - -// Unary operators are usually called element-wisely before the reduction is executed on the -// elements. -// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2 -template -struct unary_identic -{ - __device__ unary_identic(const int divider = 1) - { - scaler = 1.0f / static_cast(divider); - }; - - __device__ inline constexpr T operator()(T a) const { return a * type_convert(scaler); }; - - float scaler = 1.0f; -}; - -template -struct unary_identic -{ - __device__ unary_identic(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return a; }; -}; - -template -struct unary_square -{ - __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline constexpr T operator()(T a) const - { - a = a * a; - - return a * type_convert(scaler); - }; - - float scaler = 1.0f; -}; - -template -struct unary_square -{ - __device__ unary_square(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return a * a; }; -}; - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline constexpr T operator()(T a) const - { - a = abs(a); - - return a * type_convert(scaler); - }; - - float scaler = 1.0f; -}; - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return abs(a); }; -}; - -// We know for sure that 4.0 has __habs(), but 3.0 does not have it. -// Let's assume that __habs() exists since 3.5. 
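// [Editor's note] The version-guarded fallback below relies on the IEEE-754 binary16
// layout: bit 15 holds the sign, so masking it off with 0x7fff yields the absolute value
// for every input class (normals, subnormals, zeros, infinities, and NaNs, whose payload
// is preserved). A self-contained host-side sketch of the same trick on binary32, where
// bit 31 is the sign; the helper name is hypothetical and not part of this patch:

#include <cstdint>
#include <cstring>

static inline float abs_by_sign_mask(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // bit-copy avoids type-punning UB
    bits &= 0x7fffffffu;                  // clear the sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;                             // matches std::fabs(x), including for -0.0f
}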
-#if HIP_PACKAGE_VERSION_FLAT < 3005000000 -inline __device__ __half __habs(__half x) -{ - union - { - __half half; - unsigned short u16; - } val; - val.half = x; - val.u16 = val.u16 & 0x7fff; - return val.half; -} -#endif - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline half_t operator()(half_t a) const - { - a = static_cast(__habs(a)); - - return a * type_convert(scaler); - }; - - float scaler = 1.0f; -}; - -template <> -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { (void)divider; }; - - __device__ inline half_t operator()(half_t a) const { return static_cast(__habs(a)); }; -}; - -template -struct unary_sqrt -{ - __device__ unary_sqrt(const int divider = 1) { (void)divider; }; - - __device__ inline T operator()(T a) const { return sqrtf(a); }; -}; - -template <> -struct unary_sqrt -{ - __device__ unary_sqrt(const int divider = 1) { (void)divider; }; - - __device__ inline half_t operator()(half_t a) const { return static_cast(hsqrt(a)); }; }; }; // end of namespace reduce -// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their -// respective functor classes. -// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in -// reduce_binary_operactor for -// easier checking by the upper-layer codes in the kernels. - -template -struct reduce_binary_operator; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Mul; - using dataType = T; - - static constexpr bool indexable = reduce::Mul::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Min; - using dataType = T; - - static constexpr bool indexable = reduce::Min::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Max; - using dataType = T; - - static constexpr bool indexable = reduce::Max::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::AMax; - using dataType = T; - - static constexpr bool indexable = reduce::Max::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary -// functor classes. 
-// The two unary functors are called before and afer the Reduction is executed respectively -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_abs; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_abs; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_square; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_square; - using posUnaryOp = reduce::unary_sqrt; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_sqrt; -}; - } // end of namespace ck #endif diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp deleted file mode 100644 index ca6b415910..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - 
one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return 
(*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp deleted file mode 100644 index a3daeaf163..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,305 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = 
make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using 
refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp deleted file mode 100644 index 81899dfb02..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_multiblock.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = 
make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - const index_t reduceSizePerBlock = - (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * - copySliceLen; - - if constexpr(src2d_need_padding) - { - const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - 
typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_dst_global; - (void)indices_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - BlkGroupSize, - alpha, - static_cast(p_src_global), - beta, - static_cast(ws_buf1_global), - static_cast(ws_buf2_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp deleted file mode 100644 index 0e578f4d1d..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
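The MultiBlock prepare kernels, like the one that just ended above and the partial-dims variant below, pad the reduce axis so that every block group copies whole slices: each group takes a ceil-divided share of toReduceLen, rounded up to a multiple of copySliceLen. A compilable restatement of those two formulas (the function names and example numbers are illustrative):

#include <cassert>

// Each block group reduces ceil(toReduceLen / blkGroupSize) elements,
// rounded up to a whole number of copy slices.
int reduce_size_per_block(int toReduceLen, int blkGroupSize, int copySliceLen)
{
    const int perGroup = (toReduceLen + blkGroupSize - 1) / blkGroupSize;
    return ((perGroup + copySliceLen - 1) / copySliceLen) * copySliceLen;
}

// The zero-filled tail added by make_pad_transform on the reduce axis.
int multiblock_src_pad(int toReduceLen, int blkGroupSize, int copySliceLen)
{
    return reduce_size_per_block(toReduceLen, blkGroupSize, copySliceLen) * blkGroupSize -
           toReduceLen;
}

int main()
{
    // 10000 elements over 4 block groups with 256-element slices:
    // ceil(10000/4) = 2500 -> rounded to 2560 -> 4 * 2560 = 10240, pad = 240.
    assert(multiblock_src_pad(10000, 4, 256) == 240);
    return 0;
}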
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_multiblock.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const 
auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - const index_t reduceSizePerBlock = - (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * - copySliceLen; - - if constexpr(src2d_need_padding) - { - const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used 
by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_dst_global; - (void)indices_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - BlkGroupSize, - alpha, - static_cast(p_src_global), - beta, - static_cast(ws_buf1_global), - static_cast(ws_buf2_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp deleted file mode 100644 index e63a1254e4..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ /dev/null @@ -1,284 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? 
ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - 
invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected reference type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc =
get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp deleted file mode 100644 index 698f740058..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,318 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? 
ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto 
srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto 
get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp deleted file mode 100644 index 4a607372e9..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ /dev/null @@ -1,285 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
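In the warp-wise wrappers (the file below and the partial-dims variant after it), one warp produces one output element: the copy slice is warpSize * GredAccessesPerThreadInWarp, and the invariant axis is padded up to the total number of warps in the grid, GridSize * BlockSize / warpSize. A sketch of those two sizing rules; the fixed wavefront size of 64 is an assumption for gfx9-class GPUs, not something this diff states:

// Assumed wavefront size; the real kernels use the built-in warpSize.
constexpr int kWarpSize = 64;

constexpr int warp_copy_slice_len(int accessesPerThreadInWarp)
{
    return kWarpSize * accessesPerThreadInWarp;
}

constexpr int invariant_pad(int gridSize, int blockSize, int invariantLen)
{
    // One warp per output element: pad invariantLen up to the warp count.
    return gridSize * blockSize / kWarpSize - invariantLen;
}

static_assert(warp_copy_slice_len(2) == 128, "two accesses per lane");
static_assert(invariant_pad(2, 256, 7) == 1, "8 warps cover 7 outputs, 1 padded");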
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - 
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected reference type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename
get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp deleted file mode 100644 index a641527900..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
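// The get_reduction_*_descriptor helpers above read one of two static
// descriptor types out of the same opaque workspace bytes; `if constexpr`
// instantiates only the selected branch, so the deduced return type differs
// per specialization. A self-contained sketch (Plain2D and Padded2D are
// stand-ins for the CK descriptor types):
#include <cstdio>

struct Plain2D  { int rows, cols; };
struct Padded2D { int rows, cols, row_pad, col_pad; };

template <bool NeedPadding>
auto load_descriptor(const void* p)
{
    if constexpr(NeedPadding)
        return *static_cast<const Padded2D*>(p); // only this branch is instantiated
    else
        return *static_cast<const Plain2D*>(p);
}

int main()
{
    Padded2D stored{1, 1000, 6, 24};         // written by a prepare step
    auto d = load_descriptor<true>(&stored); // deduced as Padded2D
    std::printf("%d x %d (+%d,+%d)\n", d.rows, d.cols, d.row_pad, d.col_pad);
}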
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); 
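// This wrapper reduces only the trailing num_toReduceDims axes: the leading
// axes are merged into the invariant dimension, the trailing ones into the
// reduction dimension, and merging is just a product of lengths. A host-side
// sketch of the resulting 2-D problem shape (sample numbers assumed):
#include <cstdio>

void to_2d_problem(const int* lengths, int src_dims, int num_to_reduce,
                   long& invariant_len, long& to_reduce_len)
{
    const int num_invariant = src_dims - num_to_reduce;
    invariant_len = to_reduce_len = 1;
    for(int i = 0; i < num_invariant; ++i)
        invariant_len *= lengths[i];
    for(int i = num_invariant; i < src_dims; ++i)
        to_reduce_len *= lengths[i];
}

int main()
{
    const int lengths[4] = {8, 4, 16, 32}; // reduce the last two axes
    long inv = 0, red = 0;
    to_2d_problem(lengths, 4, 2, inv, red);
    std::printf("invariantLen=%ld toReduceLen=%ld\n", inv, red); // 32 and 512
}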
- const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - 
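// Indexing behind the make_merge_transform calls above: a merged 1-D
// coordinate is peeled into per-axis coordinates (innermost axis varying
// fastest) and folded with the real strides into a memory offset, which is
// how the merged view can sit on top of a non-packed tensor. A scalar sketch:
#include <cstdio>

long merged_index_to_offset(long idx, const int* lengths, const int* strides, int ndim)
{
    long offset = 0;
    for(int d = ndim - 1; d >= 0; --d)
    {
        offset += (idx % lengths[d]) * static_cast<long>(strides[d]);
        idx /= lengths[d];
    }
    return offset;
}

int main()
{
    const int lengths[2] = {4, 16};
    const int strides[2] = {128, 1}; // padded rows: outer stride 128, not 16
    std::printf("%ld\n", merged_index_to_offset(17, lengths, strides, 2)); // 1*128 + 1 = 129
}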
ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp deleted file mode 100644 index 7e9d46612e..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp +++ /dev/null @@ -1,205 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
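// The get_ref_desc_types machinery above exists only to name types: it builds
// descriptors from dummy lengths (the in-code comment notes the strides need
// not be accurate) and keeps decltype of the result, so the compute kernel can
// reinterpret runtime-filled workspace bytes as that exact type. A minimal
// sketch of the same decltype trick:
#include <cstdio>

template <int N>
struct Desc { int len[N]; };

template <int N>
constexpr auto make_ref_desc() { return Desc<N>{}; } // values are dummies

using RefDesc3 = decltype(make_ref_desc<3>()); // only the type is kept

int main()
{
    alignas(RefDesc3) unsigned char workspace[sizeof(RefDesc3)] = {}; // filled elsewhere
    const auto* d = reinterpret_cast<const RefDesc3*>(workspace);     // same pattern as p_src2dDesc
    std::printf("%d\n", d->len[0]);
}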
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? 
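// get_datatype_from_enum above resolves the CK_PARAM_* datatype macros to C++
// types at compile time via template specialization. A sketch of the pattern
// with an assumed enum; the names and enumerators here are illustrative, not CK's:
#include <cstdint>
#include <cstdio>

enum class DataTypeEnum { Half, Float, Double, Int32 };

template <DataTypeEnum E> struct datatype_from_enum;
template <> struct datatype_from_enum<DataTypeEnum::Float>  { using type = float;   };
template <> struct datatype_from_enum<DataTypeEnum::Double> { using type = double;  };
template <> struct datatype_from_enum<DataTypeEnum::Int32>  { using type = int32_t; };

using srcData = typename datatype_from_enum<DataTypeEnum::Float>::type;

int main()
{
    std::printf("%zu\n", sizeof(srcData)); // 4: resolved entirely at compile time
}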
ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static 
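// Why toReduceLen is simply BlkGroupSize here: the first-call kernel left
// BlkGroupSize partial results per output element in the workspace, so this
// second call reduces an [invariantLen x BlkGroupSize] matrix along its rows.
// The two-pass scheme in host form:
#include <cstdio>
#include <vector>

int main()
{
    const int invariant_len = 4, blk_group_size = 3;
    // pass-1 output: one partial sum per (output element, block group)
    std::vector<float> ws = {1, 2, 3,  4, 5, 6,  7, 8, 9,  10, 11, 12};
    std::vector<float> dst(invariant_len);

    for(int i = 0; i < invariant_len; ++i) // pass 2: reduce each row
    {
        float acc = 0.f;
        for(int j = 0; j < blk_group_size; ++j)
            acc += ws[i * blk_group_size + j];
        dst[i] = acc;
    }
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 6 15 24 33
}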
__device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp deleted file mode 100644 index 3f37d01e21..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
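// The byte offsets in gridwise_generic_reduce_2 above imply this workspace
// layout: two 2 KiB slots for the serialized descriptors, then the pass-1
// partial values, with an optional index buffer ws_buf2_bytes_offset bytes
// further on. A sketch of that view (offsets copied from the code; treating
// them as a fixed layout is an assumption, not a documented ABI):
#include <cstdint>
#include <cstdio>

struct ReductionWorkspaceView
{
    std::uint8_t* base;
    void* src2d_desc() const { return base; }        // offset 0
    void* dst1d_desc() const { return base + 2048; } // offset 2048
    void* ws_buf1() const    { return base + 4096; } // pass-1 partial values
    void* ws_buf2(long bytes_offset) const           // pass-1 indices, if present
    {
        return bytes_offset > 0 ? base + 4096 + bytes_offset : nullptr;
    }
};

int main()
{
    std::uint8_t storage[8192] = {};
    ReductionWorkspaceView ws{storage};
    std::printf("%p %p\n", ws.dst1d_desc(), ws.ws_buf1());
}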
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto 
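// Every *_prepare kernel above publishes its descriptors through thread 0
// only: all threads compute the same value, so a single writer suffices and
// avoids redundant or racing global stores. A stripped-down HIP sketch of the
// pattern (Desc2D stands in for the CK descriptor type):
#include <hip/hip_runtime.h>

struct Desc2D { int invariant_len, to_reduce_len; };

__global__ void prepare_sketch(int invariant_len, int to_reduce_len,
                               void* __restrict__ ws_global)
{
    Desc2D desc{invariant_len, to_reduce_len}; // identical in every thread
    if(threadIdx.x == 0)                       // but published exactly once
        *static_cast<Desc2D*>(ws_global) = desc;
}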
copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = 
get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp deleted file mode 100644 index 77841d1312..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp +++ /dev/null @@ -1,222 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? 
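// What need_indices changes above: index-returning reductions (MIN/MAX/AMAX
// with FLATTENED_INDICES) carry, next to each partial value, the flattened
// source index that produced it -- which is what ws_buf2 and indices_global
// hold. A scalar max-with-index sketch:
#include <cstdio>

struct ValueIndex { float value; int index; };

ValueIndex reduce_max_with_index(const float* x, int n)
{
    ValueIndex best{x[0], 0};
    for(int i = 1; i < n; ++i)
        if(x[i] > best.value)
            best = {x[i], i}; // keep the flattened index of the winner
    return best;
}

int main()
{
    const float x[5] = {3.f, 9.f, 1.f, 9.f, 2.f};
    ValueIndex r = reduce_max_with_index(x, 5);
    std::printf("max=%g at %d\n", r.value, r.index); // first maximum wins: 9 at 1
}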
ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using 
refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp deleted file mode 100644 index 2de461ad0f..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,277 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
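// The alpha/beta parameters threaded through every gridwise_generic_reduce_2
// above follow the usual MIOpen/cuDNN blending convention,
// dst = alpha * reduce(src) + beta * dst_old. In scalar form:
#include <cstdio>

float apply_alpha_beta(float reduced, float dst_old, float alpha, float beta)
{
    return alpha * reduced + beta * dst_old;
}

int main()
{
    std::printf("%g\n", apply_alpha_beta(10.f, 4.f, 1.0f, 0.f)); // overwrite: 10
    std::printf("%g\n", apply_alpha_beta(10.f, 4.f, 0.5f, 2.f)); // blend: 13
}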
- * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen 
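// The two NanPropagation_t modes configured above only matter for
// comparison-style reductions: PROPAGATE_NAN lets any NaN input poison the
// result, while NOT_PROPAGATE_NAN prefers a non-NaN candidate when one exists
// (the behavior std::fmax already has). A scalar sketch:
#include <cmath>
#include <cstdio>

float max_propagate_nan(float a, float b)
{
    return (std::isnan(a) || std::isnan(b)) ? NAN : (a > b ? a : b);
}

float max_not_propagate_nan(float a, float b)
{
    return std::fmax(a, b); // fmax returns the non-NaN operand if one is NaN
}

int main()
{
    std::printf("%g\n", max_not_propagate_nan(NAN, 2.f)); // 2
    std::printf("%g\n", max_propagate_nan(NAN, 2.f));     // nan
}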
= GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, 
- long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp deleted file mode 100644 index 1ba5e49657..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? 
NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - 
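// The launch-shape arithmetic recurring above: the threadwise method assigns
// one output row per thread (GridSize * BlockSize workers), the warpwise
// method one row per warp (GridSize * BlockSize / warpSize workers), and the
// invariant-axis pad is the gap between that worker count and invariantLen.
// A sketch with assumed numbers:
#include <cstdio>

int main()
{
    const int grid_size = 8, block_size = 256, warp_size = 64;

    const int threadwise_rows = grid_size * block_size;             // 2048 rows
    const int warpwise_rows   = grid_size * block_size / warp_size; // 32 rows

    const int invariant_len = 30;
    std::printf("warpwise pad = %d\n", warpwise_rows - invariant_len); // 2 dummy rows
    std::printf("threadwise rows = %d\n", threadwise_rows);
}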
make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp deleted file mode 100644 index aef1545f11..0000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,279 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
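// The "constexpr int RunId = need_indices ? 3 : 1" selection above is a
// compile-time dispatch: Run<1> and Run<3> are separate code paths (values
// only vs. values plus index tracking), chosen while the kernel is
// instantiated. A minimal sketch of the pattern:
#include <cstdio>

struct gridwise_reduce_sketch
{
    template <int RunId>
    static void Run();
};

template <> void gridwise_reduce_sketch::Run<1>() { std::printf("values only\n"); }
template <> void gridwise_reduce_sketch::Run<3>() { std::printf("values + indices\n"); }

int main()
{
    constexpr bool need_indices = true;
    constexpr int RunId = need_indices ? 3 : 1;
    gridwise_reduce_sketch::Run<RunId>(); // resolved at compile time
}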
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, 
dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if 
constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/device_operation/CMakeLists.txt b/device_operation/CMakeLists.txt index 764b78a122..beae42d316 100644 --- a/device_operation/CMakeLists.txt +++ b/device_operation/CMakeLists.txt @@ -111,7 +111,35 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp; ) +# device_reduce_instance +set(DEVICE_REDUCE_INSTANCE_SOURCE + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f16_f16_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f16_f32_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f32_f64_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f64_f64_f64.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f16_f16_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f16_f32_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f32_f32_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f32_f64_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f64_f64_f64.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp; + ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp; + 
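The new DEVICE_REDUCE_INSTANCE_SOURCE list being assembled here compiles one translation unit per data-type combination into a single shared library. Below is a minimal, self-contained sketch of the registry pattern those translation units follow; the names are illustrative stand-ins, not CK's actual types:

// Each instance source file appends concrete operation objects to a vector of
// base-class pointers; callers then pick the first instance that accepts their
// argument. Names here are hypothetical, for illustration only.
#include <memory>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp()                                 = default;
    virtual bool IsSupported(int lowest_length) const = 0;
};

template <int VectorSize>
struct ConcreteOp final : BaseOp
{
    // vectorized loads require the fastest-varying length to be divisible
    bool IsSupported(int lowest_length) const override
    {
        return lowest_length % VectorSize == 0;
    }
};

inline void add_instances(std::vector<std::unique_ptr<BaseOp>>& ops)
{
    ops.push_back(std::make_unique<ConcreteOp<4>>());
    ops.push_back(std::make_unique<ConcreteOp<2>>());
    ops.push_back(std::make_unique<ConcreteOp<1>>()); // always-valid fallback
}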
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp;
+    ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp;
+    ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp;
+    ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp;
+    ${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp;
+)
+
 add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
+add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
 add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
 add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
 add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
@@ -120,8 +148,8 @@ add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURC
 add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
 add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
 add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
-add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
 add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
+add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
 
 target_include_directories(device_gemm_instance SYSTEM PUBLIC $)
 target_include_directories(device_gemm_bias_2d_instance SYSTEM PUBLIC $)
@@ -134,6 +162,7 @@ target_include_directories(device_conv2d_fwd_bias_relu_instance SYSTEM PUBLIC $<
 target_include_directories(device_conv2d_fwd_bias_relu_add_instance SYSTEM PUBLIC $)
 target_include_directories(device_conv2d_fwd_bias_relu_atomic_add_instance SYSTEM PUBLIC $)
 target_include_directories(device_conv2d_bwd_data_instance SYSTEM PUBLIC $)
+target_include_directories(device_reduce_instance SYSTEM PUBLIC $)
 
 target_compile_features(device_gemm_instance PUBLIC)
 target_compile_features(device_gemm_bias_2d_instance PUBLIC)
@@ -146,6 +175,7 @@ target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
 target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
 target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
 target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
+target_compile_features(device_reduce_instance PUBLIC)
 
 set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_gemm_bias_2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -158,6 +188,7 @@ set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_I
 set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
 install(TARGETS device_gemm_bias_2d_instance LIBRARY DESTINATION lib)
@@ -170,3
+201,4 @@ install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) +install(TARGETS device_reduce_instance LIBRARY DESTINATION lib) diff --git a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp index f2a56396b6..26b1919b67 100644 --- a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -549,8 +549,11 @@ struct Conv_N_{N}, Conv_K_{K}, Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -625,8 +628,11 @@ struct index_t Conv_N_; index_t Conv_K_; index_t Conv_C_; + std::vector input_spatial_lengths_; std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; std::vector input_left_pads_; std::vector input_right_pads_; }; @@ -638,6 +644,28 @@ struct float Run(const Argument& arg, int nrepeat = 1) { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " @@ -656,6 +684,7 @@ struct std::cout << "arg.c1_grid_desc_m_n_{ " << arg.c1_grid_desc_m_n_.GetLength(I0) << ", " << arg.c1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } +#endif if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, diff --git a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp index 4ee978a7d7..6c31c65fa6 100644 --- a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -526,8 +526,11 @@ struct 
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X Conv_N_{N}, Conv_K_{K}, Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -590,8 +593,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X index_t Conv_N_; index_t Conv_K_; index_t Conv_C_; + std::vector input_spatial_lengths_; std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; std::vector input_left_pads_; std::vector input_right_pads_; }; @@ -603,6 +609,28 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X float Run(const Argument& arg, int nrepeat = 1) { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " @@ -618,6 +646,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X std::cout << "arg.c0_grid_desc_m_n_{ " << arg.c0_grid_desc_m_n_.GetLength(I0) << ", " << arg.c0_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } +#endif if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, diff --git a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 6abc455b39..3280b9ea30 100644 --- a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -498,8 +498,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W Conv_N_{N}, Conv_K_{K}, Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { @@ -551,8 +554,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W index_t Conv_N_; index_t Conv_K_; index_t Conv_C_; + std::vector input_spatial_lengths_; std::vector filter_spatial_lengths_; + 
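The newly stored spatial-length and dilation members give each Argument the full convolution problem, so it can be validated or printed without recomputing anything. For reference, the stored fields satisfy the usual output-size relation; a small standalone helper (illustrative only, not part of this diff), assuming the common definition of dilation:

inline int conv_out_length(int in_len, int filter_len, int stride,
                           int dilation, int left_pad, int right_pad)
{
    // effective filter extent once dilation spreads the taps apart
    const int effective_filter = (filter_len - 1) * dilation + 1;
    return (in_len + left_pad + right_pad - effective_filter) / stride + 1;
}

// e.g. Hi = 224, Y = 3, stride 2, dilation 1, pads 1/1  ->  Ho = 112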
std::vector output_spatial_lengths_; std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; std::vector input_left_pads_; std::vector input_right_pads_; }; @@ -564,6 +570,28 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W float Run(const Argument& arg, int nrepeat = 1) { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " @@ -598,6 +626,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W .GetLength(I5) << "}" << std::endl; } +#endif if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, diff --git a/device_operation/include/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/device_operation/include/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp index 3888e5e9c8..d14736dc57 100644 --- a/device_operation/include/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/device_operation/include/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -452,6 +452,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K float Run(const Argument& arg, int nrepeat = 1) { +#if 0 { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " @@ -464,6 +465,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } +#endif if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, diff --git a/device_operation/include/device_pool2d_fwd.hpp b/device_operation/include/device_pool2d_fwd.hpp new file mode 100644 index 0000000000..5dd6aff281 --- /dev/null +++ b/device_operation/include/device_pool2d_fwd.hpp @@ -0,0 +1,38 @@ +#ifndef DEVICE_POOL2D_FWD_HPP +#define DEVICE_POOL2D_FWD_HPP + +#include +#include +#include "device_base.hpp" +#include "reduction_enums.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePool2dFwd : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* in_dev, + void* out_dev, + void* out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) = 
0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DevicePool2dFwdPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp b/device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp new file mode 100644 index 0000000000..84593cdb5e --- /dev/null +++ b/device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp @@ -0,0 +1,327 @@ +#ifndef DEVICE_POOL2D_FWD_NHWC_NHWC_HPP +#define DEVICE_POOL2D_FWD_NHWC_NHWC_HPP + +#include +#include +#include "device_pool2d_fwd.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "reduction_operator_mapping.hpp" +#include "gridwise_2d_reduction_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + using IndexDataType = int32_t; + + using ReduceOperation = typename reduce_binary_operator::opType; + + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + + using AccElementwiseOperation = + typename reduce_unary_operator:: + AccElementwiseOperation; + + static constexpr bool BetaIsZero = true; + + static constexpr index_t InSrcOutDstVectorDim = + 0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is + // not reduced. + + static constexpr ck::index_t ReduceM_BlockTileSize = + ReduceMThreadClusterSize * ReduceMThreadSliceSize; + static constexpr ck::index_t ReduceK_BlockTileSize = + ReduceKThreadClusterSize * ReduceKThreadSliceSize; + + static auto MakeABGridDescriptor_A_M_K_B_M(ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) + { + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = window_spatial_lengths[0]; + const index_t X = window_spatial_lengths[1]; + + const index_t ConvStrideH = window_strides[0]; + const index_t ConvStrideW = window_strides[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ReduceMRaw = N * Ho * Wo * C; + const index_t ReduceMPad = + math::integer_least_multiple(ReduceMRaw, ReduceM_BlockTileSize) - ReduceMRaw; + + const index_t ReduceKRaw = Y * X; + const index_t ReduceKPad = + math::integer_least_multiple(ReduceKRaw, ReduceK_BlockTileSize) - ReduceKRaw; + + // A[ReduceM, ReduceK] + const auto in_grid_desc_n_hi_wi_c = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor( + in_grid_desc_n_hi_wi_c, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, 
Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor( + in_grid_desc_n_hip_wip_c, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_grid_desc_reducemraw_reducekraw = + transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)), + make_merge_transform(make_tuple(Y, X))), + make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor( + in_grid_desc_reducemraw_reducekraw, + make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad), + make_right_pad_transform(ReduceKRaw, ReduceKPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // B[ReduceM] + const auto out_grid_desc_reducemraw = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo * C)); + + const auto out_grid_desc_reducem = transform_tensor_descriptor( + out_grid_desc_reducemraw, + make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + + return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem); + } + + using ABGridDescs = decltype( + MakeABGridDescriptor_A_M_K_B_M(1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_M_K = remove_cvref_t; + using BGridDesc_M = remove_cvref_t; + + // TODO + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_dev, + OutDataType* p_out_dev, + int* p_out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array& input_spatial_lengths, + std::array& window_spatial_lengths, + std::array& output_spatial_lengths, + std::array& window_strides, + std::array& input_left_pads, + std::array& input_right_pads) + : p_in_dev_{p_in_dev}, + p_out_dev_{p_out_dev}, + p_out_indices_dev_{p_out_indices_dev}, + a_grid_desc_m_k_{}, + b_grid_desc_m_{} + { + const auto descs = MakeABGridDescriptor_A_M_K_B_M(N, + C, + input_spatial_lengths, + window_spatial_lengths, + output_spatial_lengths, + window_strides, + input_left_pads, + input_right_pads); + + a_grid_desc_m_k_ = descs[I0]; + b_grid_desc_m_ = descs[I1]; + + invariant_lowest_length_ = C; + reduce_lowest_length_ = window_spatial_lengths[1]; + + // TODO: is this correct? 
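The embed and merge transforms above flatten pooling into a 2D reduction view. A host-side sketch of the resulting index map follows, assuming the merge orders shown (m enumerates (n, ho, wo, c), k enumerates (y, x)); note that the AVG divider set just below equals the window size Y * X, consistent with this mapping:

// Illustration only: element (m, k) of the reduction view addresses padded
// input NHWC element (n, ho*stride_h + y, wo*stride_w + x, c).
#include <cstdint>

struct PoolCoord { int n, hip, wip, c; };

inline PoolCoord pool2d_reduce_to_input(int64_t m, int64_t k,
                                        int Ho, int Wo, int C, int X,
                                        int stride_h, int stride_w)
{
    // invert merge(N, Ho, Wo, C): m = ((n*Ho + ho)*Wo + wo)*C + c
    const int c  = static_cast<int>(m % C);
    const int wo = static_cast<int>((m / C) % Wo);
    const int ho = static_cast<int>((m / C / Wo) % Ho);
    const int n  = static_cast<int>(m / C / Wo / Ho);
    // invert merge(Y, X): k = y*X + x
    const int x  = static_cast<int>(k % X);
    const int y  = static_cast<int>(k / X);
    // embed((Y, Ho), (1, stride)): window element (y, x) of output pixel (ho, wo)
    return {n, ho * stride_h + y, wo * stride_w + x, c};
}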
+ if constexpr(ReduceOpId == ck::ReduceTensorOp_t::AVG) + { + ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1]; + in_element_op_ = InElementwiseOperation{divider}; + acc_element_op_ = AccElementwiseOperation{divider}; + } + } + + const InDataType* p_in_dev_; + OutDataType* p_out_dev_; + int* p_out_indices_dev_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_M b_grid_desc_m_; + InElementwiseOperation in_element_op_; + AccElementwiseOperation acc_element_op_; + + // for checking vector load/store + ck::index_t invariant_lowest_length_; + ck::index_t reduce_lowest_length_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise; + + const auto kernel = kernel_reduce_threadwise; + + ck::index_t ReduceM = arg.a_grid_desc_m_k_.GetLength(I0); + + const index_t grid_size = (ReduceM / ReduceM_BlockTileSize); + + return launch_and_time_kernel(kernel, + nrepeat, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.a_grid_desc_m_k_, + arg.b_grid_desc_m_, + arg.in_element_op_, + arg.acc_element_op_, + float(1), + arg.p_in_dev_, + float(0), + arg.p_out_dev_, + arg.p_out_indices_dev_); + } + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + } + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if(pArg->invariant_lowest_length_ % InSrcOutDstVectorSize != 0) + { + return (false); + } + + return (true); + } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_dev, + void* p_out_dev, + void* p_out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) override + { + return std::make_unique(static_cast(p_in_dev), + static_cast(p_out_dev), + static_cast(p_out_indices_dev), + N, + C, + input_spatial_lengths, + window_spatial_lengths, + output_spatial_lengths, + window_strides, + input_left_pads, + input_right_pads); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<" << BlockSize << ","; + str << "M_C" << ReduceMThreadClusterSize << "_S" << ReduceMThreadSliceSize << ","; + str << "K_C" << ReduceKThreadClusterSize << "_S" << ReduceKThreadSliceSize << ","; + str <<"InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; // namespace device + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce.hpp b/device_operation/include/device_reduce.hpp new file mode 100644 index 0000000000..97f4d1ad08 --- /dev/null +++ b/device_operation/include/device_reduce.hpp @@ -0,0 +1,58 @@ +#ifndef DEVICE_REDUCE_HPP +#define DEVICE_REDUCE_HPP + +#include +#include +#include + +#include "common_header.hpp" +#include "device_base.hpp" +#include "reduction_enums.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduce : public BaseOperator +{ + virtual size_t GetWorkspaceSizeInBytes(const std::vector& inLengths) + { + (void)inLengths; + + return (0); + }; + + virtual bool 
HasFurtherCall() { return (false); }; + + virtual std::vector GetWorkspace2dLengths(const BaseArgument* argPtr) + { + (void)argPtr; + return (std::vector{0, 0}); + }; + + virtual std::unique_ptr + MakeArgumentPointer(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const void* in_dev, + void* out_dev, + void* out_indices_dev, + void* workspace_dev, + const InElementwiseOperation& inElementwiseOp, + const AccElementwiseOperation& accElementwiseOp) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceReducePtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce_blockwise.hpp b/device_operation/include/device_reduce_blockwise.hpp new file mode 100644 index 0000000000..2ddd8dfb20 --- /dev/null +++ b/device_operation/include/device_reduce_blockwise.hpp @@ -0,0 +1,354 @@ +#ifndef DEVICE_REDUCE_BLOCKWISE_HPP +#define DEVICE_REDUCE_BLOCKWISE_HPP + +#include +#include +#include "device.hpp" +#include "device_reduce.hpp" +#include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceBlockWise : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + using IndexDataType = int32_t; + + static constexpr bool BetaIsZero = NeedIndices; + + using InvariantDims = decltype(get_invariant_dims()); + + static constexpr index_t srcDims = Rank; + static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 
1 : InvariantDims::Size(); + static constexpr bool reduceAllDims = (InvariantDims::Size() == 0); + + static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides) + { + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDims) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + const auto toReduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(toReduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen; + + auto in_grid_desc_m_k_padded = + transform_tensor_descriptor(in_grid_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, inPad_M), + make_right_pad_transform(innerLen, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto outerLen = out_grid_desc_m.GetLength(Number<0>{}); + + const auto inPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(outerLen, inPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_indices_dev, + AccDataType* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + 
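Both descriptor builders right-pad M and K up to whole block tiles, so a grid of complete tiles covers the problem and the pad region is masked out by the pad transforms. A standalone mirror of the math::integer_least_multiple arithmetic (a sketch, not CK's implementation), with a worked example:

#include <cstdint>

inline int64_t integer_least_multiple(int64_t x, int64_t multiple)
{
    // smallest multiple of `multiple` that is >= x
    return (x + multiple - 1) / multiple * multiple;
}

// e.g. M = 1000 with M_BlockTileSize = 8 * 16 = 128:
//   padded M = 1024, inPad_M = 24, gridSize = 1024 / 128 = 8 blocks along M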
const AccElementwiseOperation& acc_elementwise_op) + : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} + { + (void)workspace_dev; + + inLengths_ = inLengths; + inStrides_ = inStrides; + outLengths_ = outLengths; + outStrides_ = outStrides; + + in_elementwise_op_ = in_elementwise_op; + acc_elementwise_op_ = acc_elementwise_op; + + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths); + + if constexpr(InvariantDims::Size() == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; + + reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + } + + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; + + AccDataType alpha_; + OutDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + IndexDataType* out_indices_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + int invariant_lowest_length; + int reduce_lowest_length; + size_t invariant_total_length; + size_t reduce_total_length; + + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + const auto in_grid_desc_m_k = + DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_); + const auto out_grid_desc_m = + DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + + using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; + + float avg_time = 0; + + const auto kernel = kernel_reduce_blockwise; + + avg_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.alpha_, + arg.in_dev_, + arg.beta_, + arg.out_dev_, + nullptr, + arg.out_indices_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(InvariantDims::Size() == 0) + return (false); + + if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1) + return (false); + + if(pArg->invariant_lowest_length % InSrcVectorSize != 0) + return (false); + } + else + { + if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + }; + + // To improve + if(pArg->invariant_lowest_length % OutDstVectorSize != 0) + return (false); + + // cases with very small reduce_total_length should be handled by the ThreadWise method + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const void* in_dev, + void* out_dev, + void* out_indices_dev, + void* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const 
AccElementwiseOperation& acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + alpha, + beta, + static_cast(in_dev), + static_cast(out_dev), + static_cast(out_indices_dev), + static_cast(workspace_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceBlockWise<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce_blockwise_second_call.hpp b/device_operation/include/device_reduce_blockwise_second_call.hpp new file mode 100644 index 0000000000..5eb5c13dc6 --- /dev/null +++ b/device_operation/include/device_reduce_blockwise_second_call.hpp @@ -0,0 +1,317 @@ +#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP +#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP + +#include +#include +#include "device.hpp" +#include "device_reduce.hpp" +#include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceBlockWiseSecondCall + : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + using IndexDataType = int32_t; + + static constexpr bool BetaIsZero = NeedIndices; + + static_assert( + std::is_same::value, + "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!"); + + using InvariantDims = decltype(get_invariant_dims()); + + static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 
1 : InvariantDims::Size(); + + static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides) + { + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{}); + + const auto in_grid_desc_m_k = + make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen; + + auto in_grid_desc_m_k_padded = + transform_tensor_descriptor(in_grid_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, inPad_M), + make_right_pad_transform(innerLen, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto outerLen = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(outerLen, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_indices_dev, + AccDataType* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) + : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} + { + inLengths_ = inLengths; + inStrides_ = inStrides; + outLengths_ = outLengths; + outStrides_ = outStrides; + + in_elementwise_op_ = in_elementwise_op; + acc_elementwise_op_ = acc_elementwise_op; + + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + + invariant_total_length = inLengths[0]; + reduce_total_length = inLengths[1]; + + invariant_lowest_length = inLengths[0]; + reduce_lowest_length = inLengths[1]; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + + size_t ws_buf2_bytes_offset = math::integer_least_multiple( + invariant_total_length * reduce_total_length * sizeof(AccDataType), 64); + + if constexpr(NeedIndices) + workspace_indices_dev_ = reinterpret_cast( + reinterpret_cast(workspace_dev) + ws_buf2_bytes_offset); + else + workspace_indices_dev_ = nullptr; + } + + std::vector inLengths_; + std::vector inStrides_; 
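For the second call, inLengths is already the 2D [invariant, BlkGroupSize] shape left behind by a multiblock first call: one partial accumulator per (m, block-group) pair. A simplified host-side reference of what this kernel then computes (a hedged sketch assuming ADD-style accumulation and the usual alpha/beta blending, ignoring indices):

#include <cstddef>
#include <vector>

template <typename Acc, typename Out>
void reduce_second_call_ref(const std::vector<Acc>& partials, // [M x BlkGroupSize], row-major
                            std::vector<Out>& out,            // [M]
                            int M, int BlkGroupSize,
                            Acc alpha, Out beta)
{
    for(int m = 0; m < M; ++m)
    {
        Acc acc = 0;
        for(int k = 0; k < BlkGroupSize; ++k)
            acc += partials[static_cast<std::size_t>(m) * BlkGroupSize + k];
        out[m] = static_cast<Out>(alpha * acc) + beta * out[m];
    }
}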
+ std::vector outLengths_; + std::vector outStrides_; + + AccDataType alpha_; + OutDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + IndexDataType* out_indices_dev_; + IndexDataType* workspace_indices_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + int invariant_lowest_length; + int reduce_lowest_length; + size_t invariant_total_length; + size_t reduce_total_length; + + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_); + const auto out_grid_desc_m = DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor( + arg.outLengths_, arg.outStrides_); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + + using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; + + float avg_time = 0; + + const auto kernel = kernel_reduce_blockwise_second_call; + + avg_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.alpha_, + arg.in_dev_, + arg.beta_, + arg.out_dev_, + arg.workspace_indices_dev_, + arg.out_indices_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + + // To improve + if(pArg->invariant_lowest_length % OutDstVectorSize != 0) + return (false); + + // cases with very small reduce_total_length should be handled by the ThreadWise method + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const void* in_dev, + void* out_dev, + void* out_indices_dev, + void* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + alpha, + beta, + static_cast(in_dev), + static_cast(out_dev), + static_cast(out_indices_dev), + static_cast(workspace_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce_common.hpp b/device_operation/include/device_reduce_common.hpp new file mode 100644 index 
0000000000..bfa84fe0af
--- /dev/null
+++ b/device_operation/include/device_reduce_common.hpp
@@ -0,0 +1,81 @@
+#ifndef DEVICE_REDUCE_COMMON_HPP
+#define DEVICE_REDUCE_COMMON_HPP
+
+#include <vector>
+
+#include "common_header.hpp"
+#include "reduction_enums.hpp"
+#include "reduction_operator.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// template <typename InElementwiseOperation, typename AccElementwiseOperation>
+// using DeviceReducePtr =
+//     std::unique_ptr<DeviceReduce<InElementwiseOperation, AccElementwiseOperation>>;
+
+template <index_t Rank, typename ReduceDims>
+std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
+{
+    static_assert(Rank <= 6, "bigger Rank size not supported!");
+
+    size_t tensor_total_length = 1;
+    size_t reduce_total_length = 1;
+
+    static_for<0, ReduceDims::Size(), 1>{}(
+        [&](auto i) { reduce_total_length *= inLengths[ReduceDims::At(i)]; });
+
+    static_for<0, Rank, 1>{}([&](auto i) { tensor_total_length *= inLengths[i.value]; });
+
+    return std::make_pair(tensor_total_length / reduce_total_length, reduce_total_length);
+};
+
+template <index_t x, typename Seq>
+constexpr bool belong()
+{
+    bool inside = false;
+
+    static_for<0, Seq::Size(), 1>{}([&](auto i) { inside = (inside || (x == Seq::At(i))); });
+
+    return (inside);
+};
+
+template <index_t Rank, typename ReduceDims, index_t start = 0>
+constexpr auto get_invariant_dims()
+{
+    static_assert(Rank <= 6, "bigger Rank size not supported!");
+
+    if constexpr(start >= Rank)
+        return Sequence<>{};
+    else
+    {
+        if constexpr(!belong<start, ReduceDims>())
+            return merge_sequences(Sequence<start>{},
+                                   get_invariant_dims<Rank, ReduceDims, start + 1>());
+        else
+            return get_invariant_dims<Rank, ReduceDims, start + 1>();
    };
+};
+
+// helper functions using variadic template arguments
+template <index_t... Ns>
+static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+};
+
+template <index_t arraySize>
+static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
+
+    return make_tuple_from_array_and_index_seq(lengths, index_seq);
+};
+
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+#endif
diff --git a/device_operation/include/device_reduce_instance.hpp b/device_operation/include/device_reduce_instance.hpp
new file mode 100644
index 0000000000..6fd30b7cb6
--- /dev/null
+++ b/device_operation/include/device_reduce_instance.hpp
@@ -0,0 +1,28 @@
+#ifndef DEVICE_REDUCE_INSTANCE_HPP
+#define DEVICE_REDUCE_INSTANCE_HPP
+
+#include "device_reduce_instance_blockwise_f16_f16_f16.hpp"
+#include "device_reduce_instance_blockwise_f16_f32_f16.hpp"
+#include "device_reduce_instance_blockwise_f32_f32_f32.hpp"
+#include "device_reduce_instance_blockwise_f32_f64_f32.hpp"
+#include "device_reduce_instance_blockwise_f64_f64_f64.hpp"
+#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
+#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
+#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
+#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
+#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
+#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
+#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
+#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
+#include
"device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp" +#include "device_reduce_instance_threadwise_f16_f16_f16.hpp" +#include "device_reduce_instance_threadwise_f16_f32_f16.hpp" +#include "device_reduce_instance_threadwise_f32_f32_f32.hpp" +#include "device_reduce_instance_threadwise_f32_f64_f32.hpp" +#include "device_reduce_instance_threadwise_f64_f64_f64.hpp" + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise.hpp b/device_operation/include/device_reduce_instance_blockwise.hpp new file mode 100644 index 0000000000..9dd6a749b5 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise.hpp @@ -0,0 +1,168 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP + +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_impl_common.hpp" +#include "device_reduce_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_blockwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<1, 2, 2, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_blockwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, + ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +using deviceReduceBlockWisePtrType = DeviceReducePtr< + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; + +template +void add_device_reduce_instance_blockwise( + std::vector>& device_op_instances) +{ + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator:: + AccElementwiseOperation; + + constexpr bool Indexable = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? 
false : true; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + using cfg1 = + remove_cvref_t(reduce_configuration_1_instances{}))>; + + static_for<0, std::tuple_size::value, 1>{}( + [&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_blockwise{}))>; + + using ReduceOpInstance = DeviceReduceBlockWise; + + device_op_instances.push_back( + std::make_unique(ReduceOpInstance{})); + }); + }); +}; + +#define ADD_BLOCKWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + template void add_device_reduce_instance_blockwise, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector> & device_op_instances) + +#define ADD_BLOCKWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_BLOCKWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + extern template void add_device_reduce_instance_blockwise, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector::InElementwiseOperation, \ + typename reduce_unary_operator:: \ + AccElementwiseOperation>> & \ + device_op_instances) + +#define ADD_BLOCKWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_f16_f16_f16.hpp b/device_operation/include/device_reduce_instance_blockwise_f16_f16_f16.hpp new file mode 100644 index 0000000000..3adb21eeef --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_f16_f16_f16.hpp @@ -0,0 +1,41 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, 
half_t, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_f16_f32_f16.hpp b/device_operation/include/device_reduce_instance_blockwise_f16_f32_f16.hpp new file mode 100644 index 0000000000..43f565a110 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_f16_f32_f16.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_f32_f32_f32.hpp b/device_operation/include/device_reduce_instance_blockwise_f32_f32_f32.hpp new file mode 100644 index 0000000000..dca4604e11 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_f32_f32_f32.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, 
float, 7, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_f32_f64_f32.hpp b/device_operation/include/device_reduce_instance_blockwise_f32_f64_f32.hpp new file mode 100644 index 0000000000..aadac10ee1 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_f32_f64_f32.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_f64_f64_f64.hpp b/device_operation/include/device_reduce_instance_blockwise_f64_f64_f64.hpp new file mode 100644 index 0000000000..68a61e67e2 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_f64_f64_f64.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP +#define 
DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call.hpp
new file mode 100644
index 0000000000..8d5e426157
--- /dev/null
+++ b/device_operation/include/device_reduce_instance_blockwise_second_call.hpp
@@ -0,0 +1,167 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
+
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_impl_common.hpp"
+#include "device_reduce_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
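+// Where this kernel fits in the two-stage reduction path (summary inferred from the file
+// names and the static_assert below, not part of the original diff):
+// DeviceReduceMultiBlockPartialReduce first writes one partial accumulator per block into
+// a workspace, and this blockwise "second call" kernel then reduces that workspace to the
+// final output. Because the workspace holds accumulator values, the second call requires
+// InDataType == AccDataType, which is why only f16_f16_*, f32_f32_* and f64_f64_*
+// instance files exist for it.
+
+#ifdef QUICK_REDUCE_TEST
+using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
+    // clang-format off
+    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | 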
KThreadSliceSize + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<1, 2, 2, 1, 2>, + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 2, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_blockwise_second_call = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; + +template +void add_device_reduce_instance_blockwise_second_call( + std::vector>& + device_op_instances) +{ + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator:: + AccElementwiseOperation; + + constexpr bool Indexable = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + + static_assert(std::is_same::value, + "InDataType and AccDataType should be the same to use " + "add_device_reduce_instance_blockwise_second_call!"); + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + using cfg1 = + remove_cvref_t(reduce_configuration_1_instances{}))>; + + static_for<0, + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_blockwise_second_call{}))>; + + using ReduceOpInstance = DeviceReduceBlockWiseSecondCall; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + }); + }); +}; + +#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + template void add_device_reduce_instance_blockwise_second_call, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector> & \ + device_op_instances) + +#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + extern template void add_device_reduce_instance_blockwise_second_call, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector< \ + DeviceReducePtr:: \ + InElementwiseOperation, \ + typename reduce_unary_operator:: \ + AccElementwiseOperation>> & \ + device_op_instances) + +#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) 
\ + ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp new file mode 100644 index 0000000000..1283f9d327 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp @@ -0,0 +1,41 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp new file mode 100644 index 0000000000..bec7c604f9 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include 
"device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp new file mode 100644 index 0000000000..e795c37c14 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); 
// +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp new file mode 100644 index 0000000000..90549f20a2 --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp b/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp new file mode 100644 index 0000000000..c348fda6dc --- /dev/null +++ b/device_operation/include/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace 
device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/device_operation/include/device_reduce_instance_impl_common.hpp b/device_operation/include/device_reduce_instance_impl_common.hpp
new file mode 100644
index 0000000000..b25645034c
--- /dev/null
+++ b/device_operation/include/device_reduce_instance_impl_common.hpp
@@ -0,0 +1,55 @@
+#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+template <int BlockSize, int MThreadClusterSize, int KThreadClusterSize>
+struct ReductionConfiguration_1
+{
+    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, "Invalid Configuration!");
+
+    static constexpr int BlockSize_ = BlockSize;
+    static constexpr int MThreadClusterSize_ = MThreadClusterSize;
+    static constexpr int KThreadClusterSize_ = KThreadClusterSize;
+};
+
+template <int InSrcVectorDim, int InSrcVectorSize, int OutDstVectorSize, int MThreadSliceSize, int KThreadSliceSize>
+struct ReductionConfiguration_2 +{ + static constexpr int InSrcVectorDim_ = InSrcVectorDim; + static constexpr int InSrcVectorSize_ = InSrcVectorSize; + static constexpr int OutDstVectorSize_ = OutDstVectorSize; + static constexpr int MThreadSliceSize_ = MThreadSliceSize; + static constexpr int KThreadSliceSize_ = KThreadSliceSize; +}; + +using reduce_configuration_1_instances = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + +#define QUICK_REDUCE_TEST 1 + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_atomic_add.hpp b/device_operation/include/device_reduce_instance_multiblock_atomic_add.hpp new file mode 100644 index 0000000000..3ad9db71a1 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_atomic_add.hpp @@ -0,0 +1,192 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP + +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_impl_common.hpp" +#include "device_reduce_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<1, 2, 2, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, + ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +using deviceReduceMultiBlockAtomicAddPtrType = + DeviceReducePtr:: + InElementwiseOperation, + typename reduce_unary_operator:: + AccElementwiseOperation>; + +template +void add_device_reduce_instance_multiblock_atomic_add( + std::vector>& + device_op_instances) +{ + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator:: + 
AccElementwiseOperation; + + constexpr bool Indexable = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + + static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES, + "AtomicAdd can only be used with reduction operations without indices!"); + + constexpr bool op_acceptable = + (ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL || + ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1); + + constexpr bool out_type_acceptable = + (std::is_same::value || std::is_same::value); + + if constexpr(!op_acceptable || !out_type_acceptable) + return; + else + { + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + using cfg1 = + remove_cvref_t(reduce_configuration_1_instances{}))>; + + static_for< + 0, + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; + + using ReduceOpInstance = DeviceReduceMultiBlockAtomicAdd; + + device_op_instances.push_back( + std::make_unique(ReduceOpInstance{})); + }); + }); + } +}; + +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + template void add_device_reduce_instance_multiblock_atomic_add, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector> & \ + device_op_instances) + +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + extern template void add_device_reduce_instance_multiblock_atomic_add, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector::InElementwiseOperation, \ + typename reduce_unary_operator:: \ + AccElementwiseOperation>> & \ + device_op_instances) + +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) 
\ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp new file mode 100644 index 0000000000..892e2cc279 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp @@ -0,0 +1,29 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 0); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp new file mode 100644 index 0000000000..103e0b8eff --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp @@ -0,0 +1,29 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp 
b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp new file mode 100644 index 0000000000..874e196f73 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp @@ -0,0 +1,29 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce.hpp new file mode 100644 index 0000000000..84d9dbadc1 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce.hpp @@ -0,0 +1,175 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP + +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_impl_common.hpp" +#include "device_reduce_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 1, 8, 1>, + ReductionConfiguration_2<0, 4, 1, 4, 1>, + ReductionConfiguration_2<0, 2, 1, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<0, 1, 1, 1, 3>, + ReductionConfiguration_2<0, 1, 1, 1, 5>, + ReductionConfiguration_2<0, 1, 1, 1, 7>, + ReductionConfiguration_2<0, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< + typename 
reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; + +template +void add_device_reduce_instance_multiblock_partial_reduce( + std::vector>& + device_op_instances) +{ + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator:: + AccElementwiseOperation; + + constexpr bool Indexable = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + using cfg1 = + remove_cvref_t(reduce_configuration_1_instances{}))>; + + static_for< + 0, + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_partial_reduce{}))>; + + using ReduceOpInstance = DeviceReduceMultiBlockPartialReduce; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + }); + }); +}; + +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + template void add_device_reduce_instance_multiblock_partial_reduce, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector> & \ + device_op_instances) + +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + extern template void \ + add_device_reduce_instance_multiblock_partial_reduce, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector< \ + DeviceReducePtr:: \ + InElementwiseOperation, \ + typename reduce_unary_operator:: \ + AccElementwiseOperation>> & \ + device_op_instances) + +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) 
\ + ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp new file mode 100644 index 0000000000..3795353a02 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp @@ -0,0 +1,41 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp new file mode 100644 index 0000000000..0e9e0225f3 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP +#define 
DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp new file mode 100644 index 0000000000..ca7c31b038 --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp @@ -0,0 +1,45 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, 
float, float, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); // + +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp new file mode 100644 index 0000000000..a32ac0b30a --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp @@ -0,0 +1,26 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp new file mode 100644 index 0000000000..45acc267ca --- /dev/null +++ b/device_operation/include/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp @@ -0,0 +1,53 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, 
double, double, 3, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); // + +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); // + +// Will be moved to use MultiBlockAtomicAdd +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise.hpp b/device_operation/include/device_reduce_instance_threadwise.hpp new file mode 100644 index 0000000000..fdb46207c4 --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise.hpp @@ -0,0 +1,164 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP + +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_impl_common.hpp" +#include "device_reduce_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_threadwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<1, 2, 2, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_threadwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, 
+ ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +using deviceReduceThreadWisePtrType = DeviceReducePtr< + typename reduce_unary_operator::InElementwiseOperation, + typename reduce_unary_operator::AccElementwiseOperation>; + +template +void add_device_reduce_instance_threadwise( + std::vector>& device_op_instances) +{ + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator:: + AccElementwiseOperation; + + constexpr bool Indexable = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + + using cfg1 = ReductionConfiguration_1<256, 256, 1>; + + static_for<0, std::tuple_size::value, 1>{}( + [&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_threadwise{}))>; + + using ReduceOpInstance = DeviceReduceThreadWise; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + }); +}; + +#define ADD_THREADWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + template void add_device_reduce_instance_threadwise, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector> & device_op_instances) + +#define ADD_THREADWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + ADD_THREADWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +#define ADD_THREADWISE_INST_REF_BY_TYPE( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \ + extern template void add_device_reduce_instance_threadwise, \ + ReduceOpId, \ + NanOpt, \ + IndicesOpt>( \ + std::vector::InElementwiseOperation, \ + typename reduce_unary_operator:: \ + AccElementwiseOperation>> & \ + device_op_instances) + +#define ADD_THREADWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) 
\ + ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ + __VA_ARGS__) + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise_f16_f16_f16.hpp b/device_operation/include/device_reduce_instance_threadwise_f16_f16_f16.hpp new file mode 100644 index 0000000000..34aa7cf09a --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise_f16_f16_f16.hpp @@ -0,0 +1,41 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise_f16_f32_f16.hpp b/device_operation/include/device_reduce_instance_threadwise_f16_f32_f16.hpp new file mode 100644 index 0000000000..343cc07692 --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise_f16_f32_f16.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for 
ADD +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise_f32_f32_f32.hpp b/device_operation/include/device_reduce_instance_threadwise_f32_f32_f32.hpp new file mode 100644 index 0000000000..626607c575 --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise_f32_f32_f32.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); // 
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise_f32_f64_f32.hpp b/device_operation/include/device_reduce_instance_threadwise_f32_f64_f32.hpp new file mode 100644 index 0000000000..0ad14d6ae0 --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise_f32_f64_f32.hpp @@ -0,0 +1,32 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_instance_threadwise_f64_f64_f64.hpp b/device_operation/include/device_reduce_instance_threadwise_f64_f64_f64.hpp new file mode 100644 index 0000000000..fdaa10eb00 --- /dev/null +++ b/device_operation/include/device_reduce_instance_threadwise_f64_f64_f64.hpp @@ -0,0 +1,50 @@ +#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP +#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); // 
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); // +ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/device_operation/include/device_reduce_multiblock_atomic_add.hpp b/device_operation/include/device_reduce_multiblock_atomic_add.hpp new file mode 100644 index 0000000000..e607fe9a5a --- /dev/null +++ b/device_operation/include/device_reduce_multiblock_atomic_add.hpp @@ -0,0 +1,418 @@ +#ifndef DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP +#define DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP + +#include +#include +#include "device.hpp" +#include "device_base.hpp" +#include "device_reduce.hpp" +#include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_multiblock_atomic_add.hpp" +#include "gridwise_set_buffer_value.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceMultiBlockAtomicAdd + : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + using IndexDataType = int32_t; + + using InvariantDims = decltype(get_invariant_dims()); + + static constexpr index_t srcDims = Rank; + static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 
1 : InvariantDims::Size(); + static constexpr bool reduceAllDims = (InvariantDims::Size() == 0); + + static constexpr bool support_AtomicAdd = + std::is_same::value || std::is_same::value; + + static_assert(!NeedIndices && support_AtomicAdd, + "MultiBlockAtomicAdd method can only be used with non-indiced operation and when " + "having float/double output type!"); + + static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int kBlockTileIterations) + { + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDims) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + const auto toReduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(toReduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; + const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen; + + auto in_grid_desc_m_k_padded = + transform_tensor_descriptor(in_grid_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, inPad_M), + make_right_pad_transform(innerLen, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto outerLen = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(outerLen, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct 
Argument : public BaseArgument + { + Argument(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_indices_dev, + AccDataType* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) + : in_dev_{in_dev}, out_dev_{out_dev} + { + (void)out_indices_dev; + (void)workspace_dev; + + inLengths_ = inLengths; + inStrides_ = inStrides; + outLengths_ = outLengths; + outStrides_ = outStrides; + + in_elementwise_op_ = in_elementwise_op; + acc_elementwise_op_ = acc_elementwise_op; + + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths); + + if constexpr(InvariantDims::Size() == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; + + reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; + + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + kBlockTileIterations = iterations; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + + gridSize_pre = + math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; + } + + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; + + AccDataType alpha_; + OutDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + int invariant_lowest_length; + int reduce_lowest_length; + size_t invariant_total_length; + size_t reduce_total_length; + + index_t blkGroupSize; + index_t kBlockTileIterations; + size_t gridSize; + + size_t gridSize_pre; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); + const auto out_grid_desc_m = DeviceReduceMultiBlockAtomicAdd::MakeDst1dDescriptor( + arg.outLengths_, arg.outStrides_); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + + using GridwiseReduce = + GridwiseReduction_mk_to_m_multiblock_atomic_add; + + float avg_time = 0; + + KernelTimer timer; + + const auto kernel_pre = kernel_buffer_set_value; + const auto kernel_main = kernel_reduce_multiblock_atocmi_add; + + printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n", + arg.gridSize, + BlockSize); + printf("Warm up\n"); + + for(int i = 0; i < nrepeat + 1; i++) + { + if(i == 1) + timer.Start(); + + launch_kernel(kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m, + arg.out_dev_, + static_cast(0.0f)); + + launch_kernel(kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + 
arg.blkGroupSize, + arg.kBlockTileIterations, + arg.alpha_, + arg.in_dev_, + arg.out_dev_); + }; + + timer.End(); + + avg_time = timer.GetElapsedTime() / nrepeat; + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(InvariantDims::Size() == 0) + return (false); + + if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1) + return (false); + + if(pArg->invariant_lowest_length % InSrcVectorSize != 0) + return (false); + } + else + { + if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + }; + + if(static_cast(pArg->beta_) != 0.0f) + return (false); + + // To improve + if(pArg->invariant_lowest_length % OutDstVectorSize != 0) + return (false); + + // cases with small reduce_total_length should be handled by the BlockWise method + if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) + return (false); + + // This is very strong restriction, but needed to avoid some failure + if(pArg->invariant_lowest_length % M_BlockTileSize != 0) + return (false); + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const void* in_dev, + void* out_dev, + void* out_indices_dev, + void* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + alpha, + beta, + static_cast(in_dev), + static_cast(out_dev), + static_cast(out_indices_dev), + static_cast(workspace_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce_multiblock_partial_reduce.hpp b/device_operation/include/device_reduce_multiblock_partial_reduce.hpp new file mode 100644 index 0000000000..ffd294aff7 --- /dev/null +++ b/device_operation/include/device_reduce_multiblock_partial_reduce.hpp @@ -0,0 +1,419 @@ +#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP +#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP + +#include +#include +#include "device.hpp" +#include "device_reduce.hpp" +#include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceMultiBlockPartialReduce + : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + 
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); + + using IndexDataType = int32_t; + + using InvariantDims = decltype(get_invariant_dims()); + + static constexpr index_t srcDims = Rank; + static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); + static constexpr bool reduceAllDims = (InvariantDims::Size() == 0); + + static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + size_t GetWorkspaceSizeInBytes(const std::vector& inLengths) override + { + size_t invariant_total_length; + size_t reduce_total_length; + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths); + + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + size_t workspace_size = invariant_total_length * blkGroupSize; + + size_t wsSizeInBytes = + !NeedIndices ? workspace_size * sizeof(AccDataType) + : workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int); + + return (wsSizeInBytes); + }; + + bool HasFurtherCall() override { return (true); }; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int kBlockTileIterations) + { + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDims) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + const auto toReduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(toReduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; + const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen; + + auto in_grid_desc_m_k_padded = + transform_tensor_descriptor(in_grid_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, inPad_M), + make_right_pad_transform(innerLen, inPad_K)), + 
make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize) + { + auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize)); + + const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + + auto ws_desc_m_k_padded = + transform_tensor_descriptor(ws_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, wsPad), + make_pass_through_transform(blkGroupSize)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (ws_desc_m_k_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_indices_dev, + AccDataType* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) + : in_dev_{in_dev}, + out_dev_{out_dev}, + out_indices_dev_{out_indices_dev}, + workspace_dev_{workspace_dev} + { + inLengths_ = inLengths; + inStrides_ = inStrides; + outLengths_ = outLengths; + outStrides_ = outStrides; + + in_elementwise_op_ = in_elementwise_op; + acc_elementwise_op_ = acc_elementwise_op; + + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths); + + if constexpr(InvariantDims::Size() == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; + + reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; + + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + kBlockTileIterations = iterations; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + + size_t ws_buf2_bytes_offset = math::integer_least_multiple( + invariant_total_length * blkGroupSize * sizeof(AccDataType), 64); + + if constexpr(NeedIndices) + workspace_indices_dev_ = reinterpret_cast( + reinterpret_cast(workspace_dev_) + ws_buf2_bytes_offset); + else + workspace_indices_dev_ = nullptr; + } + + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; + + AccDataType alpha_; + OutDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + IndexDataType* out_indices_dev_; + AccDataType* workspace_dev_; + IndexDataType* workspace_indices_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + int invariant_lowest_length; + int reduce_lowest_length; + size_t invariant_total_length; + size_t reduce_total_length; + + index_t blkGroupSize; + index_t kBlockTileIterations; + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, 
arg.blkGroupSize, arg.kBlockTileIterations); + const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor( + arg.invariant_total_length, arg.blkGroupSize); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using WorkspaceDesc_M_K = decltype(ws_desc_m_k); + + using GridwiseReduce = + GridwiseReduction_mk_to_mk_multiblock_partial_reduce; + + float avg_time = 0; + + const auto kernel = kernel_partial_reduce_multiblock; + + avg_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + ws_desc_m_k, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.blkGroupSize, + arg.kBlockTileIterations, + arg.in_dev_, + arg.workspace_dev_, + arg.workspace_indices_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(OutDstVectorSize != 1) + return (false); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(InvariantDims::Size() == 0) + return (false); + + if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1) + return (false); + + if(pArg->invariant_lowest_length % InSrcVectorSize != 0) + return (false); + } + else + { + if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + }; + + // cases with small reduce_total_length should be handled by the BlockWise method + if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) + return (false); + + return (true); + }; + + std::vector GetWorkspace2dLengths(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + return ( + std::vector{static_cast(pArg->invariant_total_length), pArg->blkGroupSize}); + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const void* in_dev, + void* out_dev, + void* out_indices_dev, + void* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + alpha, + beta, + static_cast(in_dev), + static_cast(out_dev), + static_cast(out_indices_dev), + static_cast(workspace_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/device_operation/include/device_reduce_threadwise.hpp b/device_operation/include/device_reduce_threadwise.hpp new file mode 100644 index 0000000000..a16eceaaf9 --- /dev/null +++ b/device_operation/include/device_reduce_threadwise.hpp @@ 
-0,0 +1,355 @@ +#ifndef DEVICE_REDUCE_THREADWISE_HPP +#define DEVICE_REDUCE_THREADWISE_HPP + +#include +#include +#include "device.hpp" +#include "device_reduce.hpp" +#include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceThreadWise : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1), + "Threadwise can only be called with KThreadClusterSize be 1 !"); + + using IndexDataType = int32_t; + + static constexpr bool BetaIsZero = NeedIndices; + + using InvariantDims = decltype(get_invariant_dims()); + + static constexpr index_t srcDims = Rank; + static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size(); + static constexpr bool reduceAllDims = (InvariantDims::Size() == 0); + + static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides) + { + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDims) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + const auto toReduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(toReduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen; + + auto in_grid_desc_m_k_padded = + transform_tensor_descriptor(in_grid_desc_m_k, + make_tuple(make_right_pad_transform(outerLen, inPad_M), + make_right_pad_transform(innerLen, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename 
arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto outerLen = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(outerLen, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::vector& inLengths, + const std::vector& inStrides, + const std::vector& outLengths, + const std::vector& outStrides, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_indices_dev, + AccDataType* workspace_dev, + const InElementwiseOperation& in_elementwise_op, + const OutElementwiseOperation& acc_elementwise_op) + : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev} + { + (void)workspace_dev; + + inLengths_ = inLengths; + inStrides_ = inStrides; + outLengths_ = outLengths; + outStrides_ = outStrides; + + in_elementwise_op_ = in_elementwise_op; + acc_elementwise_op_ = acc_elementwise_op; + + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths); + + if constexpr(InvariantDims::Size() == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)]; + + reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)]; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + } + + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; + + AccDataType alpha_; + OutDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + IndexDataType* out_indices_dev_; + + InElementwiseOperation in_elementwise_op_; + OutElementwiseOperation acc_elementwise_op_; + + int invariant_lowest_length; + int reduce_lowest_length; + size_t invariant_total_length; + size_t reduce_total_length; + + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + const auto in_grid_desc_m_k = + DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_); + const auto out_grid_desc_m = + DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + + using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise; + + float avg_time = 0; + + const auto kernel = kernel_reduce_threadwise; + + avg_time = launch_and_time_kernel(kernel, + nrepeat, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.alpha_, + arg.in_dev_, + arg.beta_, + arg.out_dev_, + arg.out_indices_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(InvariantDims::Size() == 0) + return (false); + + if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1) + return (false); + + if(pArg->invariant_lowest_length % 
InSrcVectorSize != 0)
+                return (false);
+        }
+        else
+        {
+            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+                return (false);
+
+            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
+                return (false);
+        };
+
+        // To improve
+        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
+            return (false);
+
+        // TODO: remove this; it should return true as long as this DeviceOp instance supports the
+        // case. For bigger reduce_total_length sizes, the BlockWise method is supposed to be used
+        // for better performance.
+        if(pArg->reduce_total_length / KThreadSliceSize >= 32)
+            return (false);
+
+        return (true);
+    };
+
+    std::unique_ptr
+    MakeArgumentPointer(const std::vector& inLengths,
+                        const std::vector& inStrides,
+                        const std::vector& outLengths,
+                        const std::vector& outStrides,
+                        float alpha,
+                        float beta,
+                        const void* in_dev,
+                        void* out_dev,
+                        void* out_indices_dev,
+                        void* workspace_dev,
+                        const InElementwiseOperation& in_elementwise_op,
+                        const OutElementwiseOperation& acc_elementwise_op) override
+    {
+        return std::make_unique(inLengths,
+                                inStrides,
+                                outLengths,
+                                outStrides,
+                                alpha,
+                                beta,
+                                static_cast(in_dev),
+                                static_cast(out_dev),
+                                static_cast(out_indices_dev),
+                                static_cast(workspace_dev),
+                                in_elementwise_op,
+                                acc_elementwise_op);
+    };
+
+    std::unique_ptr MakeInvokerPointer() override
+    {
+        return std::make_unique();
+    };
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceReduceThreadWise<" << BlockSize << ",";
+        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
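For orientation, a minimal host-side sketch of how such a device op is driven through the generic interface above. Only member functions visible in this diff are used; `reduce_op` stands for one of the DeviceReducePtr instances collected by add_device_reduce_instance_threadwise(), the length/stride vectors are assumed to be std::vector<int>, and all other names are hypothetical placeholders for the caller's buffers:

    #include <vector>

    // illustrative sketch, not part of the change itself
    template <typename DeviceReducePtrT, typename InElemOp, typename AccElemOp>
    float run_reduce(DeviceReducePtrT& reduce_op,
                     const std::vector<int>& inLengths, const std::vector<int>& inStrides,
                     const std::vector<int>& outLengths, const std::vector<int>& outStrides,
                     const void* in_dev, void* out_dev,
                     const InElemOp& in_op, const AccElemOp& acc_op)
    {
        auto arg = reduce_op->MakeArgumentPointer(inLengths, inStrides, outLengths, outStrides,
                                                  1.0f /*alpha*/, 0.0f /*beta*/,
                                                  in_dev, out_dev,
                                                  nullptr /*out_indices_dev*/,
                                                  nullptr /*workspace: unused by ThreadWise*/,
                                                  in_op, acc_op);
        if(!reduce_op->IsSupportedArgument(arg.get()))
            return -1.0f; // caller falls back to another registered instance

        return reduce_op->MakeInvokerPointer()->Run(arg.get(), 1 /*nrepeat*/);
    }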
diff --git a/device_operation/include/reduction_operator_mapping.hpp b/device_operation/include/reduction_operator_mapping.hpp
new file mode 100644
index 0000000000..da896ad75b
--- /dev/null
+++ b/device_operation/include/reduction_operator_mapping.hpp
@@ -0,0 +1,169 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
+#define CK_REDUCTION_OPERATOR_MAPPING_HPP
+
+#include "reduction_operator.hpp"
+#include "reduction_enums.hpp"
+#include "element_wise_operation.hpp"
+
+namespace ck {
+
+// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
+// respective functor classes.
+// The boolean member "indexable" is also provided in reduce_binary_operator for
+// easier checking by the upper-layer code in the kernels.
+
+template
+struct reduce_binary_operator;
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Add;
+    using dataType = T;
+
+    static constexpr bool indexable = false;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Mul;
+    using dataType = T;
+
+    static constexpr bool indexable = false;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Min;
+    using dataType = T;
+
+    static constexpr bool indexable = true;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Max;
+    using dataType = T;
+
+    static constexpr bool indexable = true;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::AMax;
+    using dataType = T;
+
+    static constexpr bool indexable = true;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Add;
+    using dataType = T;
+
+    static constexpr bool indexable = false;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Add;
+    using dataType = T;
+
+    static constexpr bool indexable = false;
+};
+
+template
+struct reduce_binary_operator
+{
+    using opType   = reduce::Add;
+    using dataType = T;
+
+    static constexpr bool indexable = false;
+};
+
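Note that AVG, NORM1 and NORM2 all accumulate with reduce::Add; what distinguishes them lives entirely in the unary pre/post operations supplied by the reduce_unary_operator mapping that follows. A minimal host-side sketch of the NORM2 decomposition, assuming only the mathematical definition of the 2-norm:

    #include <math.h>

    // illustrative sketch, not part of the diff:
    // NORM2(x) = sqrt( sum_i x_i * x_i ), i.e. UnarySquare as the in-elementwise op,
    // reduce::Add as the accumulation, and UnarySqrt as the final acc-elementwise op.
    float norm2_reference(const float* x, int n)
    {
        float acc = 0.0f;
        for(int i = 0; i < n; ++i)
            acc += x[i] * x[i]; // UnarySquare, then Add
        return sqrtf(acc);      // UnarySqrt
    }

NORM1 follows the same pattern with UnaryAbs before the Add, and AVG applies the divider-enabled UnaryIdentic after it.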
+// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
+// functor classes.
+// The two unary functors are called before and after the Reduction is executed, respectively.
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+};
+
+template
+struct reduce_unary_operator
+{
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+};
+
+} // end of namespace ck
+
+#endif
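The _BY_ID / _REF_BY_ID pairing used by the source files below is the standard explicit-instantiation split: the header declares every supported combination with `extern template` (as the ADD_THREADWISE_INST_REF_BY_TYPE macro shown earlier does), and exactly one .cpp file provides the matching `template` definition, so each instance is compiled once instead of in every translation unit. A minimal self-contained illustration of that mechanism with hypothetical names (`register_instance` merely stands in for the add_device_reduce_instance_* functions; the real macros additionally spell out the full template argument lists):

    #include <vector>

    // lib.hpp -- what the _REF_BY_ID macros emit, schematically: a declaration
    // only, so every includer compiles against the instance without instantiating it.
    template <typename T>
    void register_instance(std::vector<T>& v);
    extern template void register_instance<float>(std::vector<float>&);

    // lib.cpp -- what the _BY_ID macros emit: the single explicit instantiation.
    template <typename T>
    void register_instance(std::vector<T>& v)
    {
        v.push_back(T{});
    }
    template void register_instance<float>(std::vector<float>&);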
diff --git a/device_operation/src/device_reduce_instance_blockwise_f16_f16_f16.cpp b/device_operation/src/device_reduce_instance_blockwise_f16_f16_f16.cpp
new file mode 100644
index 0000000000..d471d25806
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f16_f16_f16.cpp
@@ -0,0 +1,34 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);       //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_blockwise_f16_f32_f16.cpp b/device_operation/src/device_reduce_instance_blockwise_f16_f32_f16.cpp
new file mode 100644
index 0000000000..df26eb303e
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f16_f32_f16.cpp
@@ -0,0 +1,25 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);       //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp
new file mode 100644
index 0000000000..429bdf88a3
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp
@@ -0,0 +1,43 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0);       //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);       //
+ADD_BLOCKWISE_INST_BY_ID(float, 
diff --git a/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp
new file mode 100644
index 0000000000..429bdf88a3
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp
@@ -0,0 +1,43 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_blockwise_f32_f64_f32.cpp b/device_operation/src/device_reduce_instance_blockwise_f32_f64_f32.cpp
new file mode 100644
index 0000000000..36708b908b
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f32_f64_f32.cpp
@@ -0,0 +1,25 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_blockwise_f64_f64_f64.cpp b/device_operation/src/device_reduce_instance_blockwise_f64_f64_f64.cpp
new file mode 100644
index 0000000000..861e090af1
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_blockwise_f64_f64_f64.cpp
@@ -0,0 +1,43 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp b/device_operation/src/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp new file mode 100644 index 0000000000..cd0c51a275 --- /dev/null +++ b/device_operation/src/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp @@ -0,0 +1,34 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp 
b/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp new file mode 100644 index 0000000000..a64adb633a --- /dev/null +++ b/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp @@ -0,0 +1,25 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp new file mode 100644 index 0000000000..5b4d492fef --- /dev/null +++ b/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp @@ -0,0 +1,43 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); // 
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp b/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp new file mode 100644 index 0000000000..ff8cf68ce9 --- /dev/null +++ b/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp @@ -0,0 +1,25 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp b/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp new file mode 100644 index 0000000000..ef19a26935 --- /dev/null +++ b/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp @@ -0,0 +1,43 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); // +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 
2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
new file mode 100644
index 0000000000..93cf4773d4
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
@@ -0,0 +1,22 @@
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0); //
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
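Only ADD (0) and AVG (5), with float or double accumulation and output, get MultiBlockAtomicAdd instances. That is consistent with this method finishing the reduction by atomically adding each workgroup's partial sum straight into the output, which requires an associative, commutative combine and atomic support for the output type. MIN/MAX/AMAX and every indexed variant instead use the MultiBlockPartialReduce instances further below, which park one partial result per workgroup in a workspace that a blockwise "second call" kernel (instantiated above) then reduces. A standalone host-side sketch of that two-pass shape, with hypothetical names:

#include <algorithm>
#include <cmath>
#include <vector>

// Pass 1: each "workgroup" b reduces its slice of x into workspace[b].
// Pass 2: one final reduction over the workspace, which is the role the
// *_second_call instances appear to play on the device side.
inline float two_pass_max(const std::vector<float>& x, std::size_t num_blocks)
{
    const std::size_t chunk = (x.size() + num_blocks - 1) / num_blocks;
    std::vector<float> workspace(num_blocks, -INFINITY);

    for(std::size_t b = 0; b < num_blocks; ++b)
        for(std::size_t i = b * chunk; i < std::min(x.size(), (b + 1) * chunk); ++i)
            workspace[b] = std::max(workspace[b], x[i]);

    return *std::max_element(workspace.begin(), workspace.end());
}

MAX cannot be finished with an atomic add, so for it the intermediate workspace (and the extra kernel launch) is unavoidable; ADD and AVG can skip both.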
diff --git a/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
new file mode 100644
index 0000000000..f28284dcba
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
@@ -0,0 +1,22 @@
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
new file mode 100644
index 0000000000..ae2fd4bdd8
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
@@ -0,0 +1,22 @@
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
new file mode 100644
index 0000000000..e5995b9dc0
--- /dev/null
+++ b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
@@ -0,0 +1,34 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t,
half_t, half_t, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp new file mode 100644 index 0000000000..5f966df0f6 --- /dev/null +++ b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp @@ -0,0 +1,25 @@ +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp new file mode 100644 index 0000000000..581cdfea13 --- /dev/null +++ b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp @@ -0,0 +1,38 @@ +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 
0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // + +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp new file mode 100644 index 0000000000..c1c2bdb3b3 --- /dev/null +++ b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp @@ -0,0 +1,19 @@ +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp new file mode 100644 index 0000000000..8aec4e96bf --- /dev/null +++ b/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp @@ -0,0 +1,46 @@ +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN 
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // + +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); // + +// Will be moved to use MultiBlockAtomicAdd +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); // +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_threadwise_f16_f16_f16.cpp b/device_operation/src/device_reduce_instance_threadwise_f16_f16_f16.cpp new file mode 100644 index 0000000000..ff1f126fac --- /dev/null +++ b/device_operation/src/device_reduce_instance_threadwise_f16_f16_f16.cpp @@ -0,0 +1,34 @@ +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); 
// +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_threadwise_f16_f32_f16.cpp b/device_operation/src/device_reduce_instance_threadwise_f16_f32_f16.cpp new file mode 100644 index 0000000000..898eb999cf --- /dev/null +++ b/device_operation/src/device_reduce_instance_threadwise_f16_f32_f16.cpp @@ -0,0 +1,25 @@ +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_threadwise_f32_f32_f32.cpp b/device_operation/src/device_reduce_instance_threadwise_f32_f32_f32.cpp new file mode 100644 index 0000000000..815c1ac20d --- /dev/null +++ b/device_operation/src/device_reduce_instance_threadwise_f32_f32_f32.cpp @@ -0,0 +1,43 @@ +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_threadwise_f32_f64_f32.cpp b/device_operation/src/device_reduce_instance_threadwise_f32_f64_f32.cpp new file mode 100644 index 0000000000..e42e22edcf --- /dev/null +++ b/device_operation/src/device_reduce_instance_threadwise_f32_f64_f32.cpp @@ -0,0 +1,25 @@ +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims +ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/device_operation/src/device_reduce_instance_threadwise_f64_f64_f64.cpp b/device_operation/src/device_reduce_instance_threadwise_f64_f64_f64.cpp new file mode 100644 index 0000000000..bf72f21c7d --- /dev/null +++ b/device_operation/src/device_reduce_instance_threadwise_f64_f64_f64.cpp @@ -0,0 +1,43 @@ +#include "device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims 
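// Note on the three kernel tiers instantiated in this patch (an inference from
// the file names; the gridwise kernel headers are authoritative): the
// threadwise instances in these files appear to reduce each output element
// with a single thread, which suits short reduction lengths; the blockwise
// instances dedicate one workgroup per output element; and the multiblock
// instances split one output element across several workgroups, combining the
// pieces either via atomic adds or via the second-call kernels above.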
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD +ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); +ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); +ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG +ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 +ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); // +ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/example/12_pool2d_fwd/pool2d_fwd.cpp b/example/12_pool2d_fwd/pool2d_fwd.cpp new file mode 100644 index 0000000000..313ba086ff --- /dev/null +++ b/example/12_pool2d_fwd/pool2d_fwd.cpp @@ -0,0 +1,311 @@ +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_reduce_util.hpp" +#include "device_tensor.hpp" +#include "tensor_layout.hpp" +#include "reduction_operator.hpp" +#include "device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG; +#endif + +static constexpr bool NeedIndices = false; +static constexpr bool PropagateNan = false; + +using DevicePoolFwdInstance = + ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< + InDataType, // InDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + ReduceOpId, + NeedIndices, + 64, // BlockSize + 64, // ReduceMThreadClusterSize + 1, // ReduceKThreadClusterSize + 4, // ReduceMThreadSliceSize + 1, // 
ReduceKThreadSliceSize
+        4>; // InSrcOutDstVectorSize
+
+template <typename InDataType,
+          typename OutDataType,
+          typename AccDataType,
+          ck::ReduceTensorOp_t ReduceOpId,
+          bool PropagateNan,
+          bool NeedIndices>
+static void pool_host_verify(const Tensor<InDataType>& in,
+                             Tensor<OutDataType>& out,
+                             Tensor<int>& out_indices,
+                             const std::array<ck::index_t, 2>& window_spatial_lengths,
+                             const std::array<ck::index_t, 2>& window_strides,
+                             const std::array<ck::index_t, 2>& in_left_pads,
+                             const std::array<ck::index_t, 2>& /*in_right_pads*/)
+{
+    using namespace ck::host_reduce;
+
+    const int divider = window_spatial_lengths[0] * window_spatial_lengths[1];
+
+    const auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
+    const auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
+
+    if constexpr(!NeedIndices)
+    {
+        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+
+        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
+            auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+
+            for(int y = 0; y < window_spatial_lengths[0]; ++y)
+            {
+                int hi = ho * window_strides[0] + y - in_left_pads[0];
+                for(int x = 0; x < window_spatial_lengths[1]; ++x)
+                {
+                    int wi = wo * window_strides[1] + x - in_left_pads[1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
+
+                        PreUnaryOp(currVal);
+
+                        binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                    }
+                }
+            }
+
+            PosUnaryOp(accuVal);
+
+            out(n, c, ho, wo) = accuVal;
+        };
+
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+    }
+    else
+    {
+        auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
+
+        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
+            auto accuVal  = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            int accuIndex = 0;
+
+            for(int y = 0; y < window_spatial_lengths[0]; ++y)
+            {
+                int hi = ho * window_strides[0] + y - in_left_pads[0];
+                for(int x = 0; x < window_spatial_lengths[1]; ++x)
+                {
+                    int wi = wo * window_strides[1] + x - in_left_pads[1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
+                        int currIndex       = y * window_spatial_lengths[1] + x;
+
+                        PreUnaryOp(currVal);
+
+                        binop_with_nan_check2<AccDataType, PropagateNan>(
+                            opReduce, accuVal, currVal, accuIndex, currIndex);
+                    }
+                }
+            }
+
+            PosUnaryOp(accuVal);
+
+            out(n, c, ho, wo)         = accuVal;
+            out_indices(n, c, ho, wo) = accuIndex;
+        };
+
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+    };
+}
+
+int main(int argc, char* argv[])
+{
+    using namespace ck::host_reduce;
+
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+
+    // Pool shape
+    ck::index_t N               = 128;
+    ck::index_t C               = 192;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else if(argc == 16)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+
+        N               = std::stoi(argv[4]);
+        C               = std::stoi(argv[5]);
+        Y               = std::stoi(argv[6]);
+        X               = std::stoi(argv[7]);
+        Hi              = std::stoi(argv[8]);
+        Wi              = std::stoi(argv[9]);
+        window_stride_h = std::stoi(argv[10]);
+        window_stride_w = std::stoi(argv[11]);
+        in_left_pad_h   = std::stoi(argv[12]);
+        in_left_pad_w   = std::stoi(argv[13]);
+        in_right_pad_h  = std::stoi(argv[14]);
+        in_right_pad_w  = std::stoi(argv[15]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(0);
+    }
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    const std::array<ck::index_t, 2> window_spatial_lengths{{Y, X}};
+    const std::array<ck::index_t, 2> window_strides{{window_stride_h, window_stride_w}};
+    const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
+    const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+
+    // tensor layout
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            if constexpr(ck::is_same<decltype(layout),
+                                     ck::tensor_layout::convolution::NCHW>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            }
+            else if constexpr(ck::is_same<decltype(layout),
+                                          ck::tensor_layout::convolution::NHWC>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            }
+        };
+
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+    Tensor<int> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+    Tensor<int> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace());
+    DeviceMem out_indices_device_buf(sizeof(int) *
+                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+
+    auto pool        = DevicePoolFwdInstance{};
+    auto invoker_ptr = pool.MakeInvokerPointer();
+    auto argument_ptr =
+        pool.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                 static_cast<int*>(out_indices_device_buf.GetDeviceBuffer()),
+                                 N,
+                                 C,
+                                 std::array<ck::index_t, 2>{{Hi, Wi}},
+                                 std::array<ck::index_t, 2>{{Y, X}},
+                                 std::array<ck::index_t, 2>{{Ho, Wo}},
+                                 window_strides,
+                                 input_left_pads,
+                                 input_right_pads);
+
+    if(!pool.IsSupportedArgument(argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+
+    std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
+
+    std::size_t num_btype =
+        sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo);
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        pool_host_verify<InDataType,
+                         OutDataType,
+                         AccDataType,
+                         ReduceOpId,
+                         PropagateNan,
+                         NeedIndices>(in_n_c_hi_wi,
+                                      out_n_c_ho_wo_host,
+                                      out_indices_n_c_ho_wo_host,
+                                      window_spatial_lengths,
+                                      window_strides,
+                                      input_left_pads,
+                                      input_right_pads);
+
+        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
+
+        check_error(out_n_c_ho_wo_host, out_n_c_ho_wo_device);
+
+        if constexpr(NeedIndices)
+        {
+            out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
+
+            // check_indices(out_indices_n_c_ho_wo_host, out_indices_n_c_ho_wo_device);
+        };
+    }
+}
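The reduce_blockwise example below drives the blockwise instances directly. One detail worth noting: MakeArgumentPointer receives the reduce length through both elementwise-operation arguments, which is how AVG (divide the accumulated sum by the element count) and NORM2 (square each input before accumulation, take the square root afterwards) are realized on top of a plain ADD accumulation. A standalone host reference for the NORM2 case under that reading, including the alpha/beta scaling the example's scales option suggests (a sketch, not the device kernels):

#include <cmath>
#include <cstddef>

// y = alpha * norm2(x) + beta * y_old: square on the way in, ADD to
// accumulate, square root on the way out.
inline double norm2_reference(
    const double* x, std::size_t n, double alpha, double beta, double y_old)
{
    double acc = 0.0;
    for(std::size_t i = 0; i < n; ++i)
        acc += x[i] * x[i];                           // in-elementwise: square, then accumulate
    return alpha * std::sqrt(acc) + beta * y_old;     // acc-elementwise: sqrt, then scale
}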
diff --git a/example/13_reduce_blockwise/reduce_blockwise.cpp b/example/13_reduce_blockwise/reduce_blockwise.cpp
new file mode 100644
index 0000000000..32cea9cb24
--- /dev/null
+++ b/example/13_reduce_blockwise/reduce_blockwise.cpp
@@ -0,0 +1,395 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_base.hpp"
+#include "device_reduce_blockwise.hpp"
+#include "host_reduce_util.hpp"
+#include "host_generic_reduction.hpp"
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+
+using namespace ck;
+using namespace ck::tensor_operation::device;
+
+using InDataType  = half_float::half;
+using OutDataType = half_float::half;
+using AccDataType = float;
+
+using kInDataType  = ck::half_t;
+using kOutDataType = ck::half_t;
+using kAccDataType = float;
+
+constexpr int Rank = 4;
+using ReduceDims_  = ck::Sequence<0, 1, 2>;
+
+constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
+constexpr NanPropagation_t NanOpt     = NanPropagation_t::PROPAGATE_NAN;
+constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
+
+using ReduceOperation = typename reduce_binary_operator<kAccDataType, ReduceOpId>::opType;
+using InElementwiseOperation =
+    typename reduce_unary_operator<kAccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+using AccElementwiseOperation =
+    typename reduce_unary_operator<kAccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+
+using DeviceReduceInstance = DeviceReduceBlockWise;
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"scales", required_argument, nullptr, 'S'},
+                                       {"verify", required_argument, nullptr, 'v'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    template <typename T>
+    static T getSingleValueFromString(const std::string& valueStr)
+    {
+        std::istringstream iss(valueStr);
+
+        T ret;
+
+        iss >> ret;
+
+        return (ret);
+    };
+
+    template <typename T>
+    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
+    {
+        std::string valuesStr(cstr_values);
+
+        std::vector<T> values;
+        std::size_t pos = 0;
+        std::size_t new_pos;
+
+        new_pos = valuesStr.find(',', pos);
+        while(new_pos != std::string::npos)
+        {
+            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
+
+            T val = getSingleValueFromString<T>(sliceStr);
+
+            values.push_back(val);
+
+            pos     = new_pos + 1;
+            new_pos = valuesStr.find(',', pos);
+        };
+
+        std::string sliceStr = valuesStr.substr(pos);
+        T val                = getSingleValueFromString<T>(sliceStr);
+
+        values.push_back(val);
+
+        return (values);
+    };
+
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<std::size_t> inLengths;
+    std::vector<float> scales;
+
+    bool do_verification = false;
+
+    int init_method = 1;
+    int nrepeat     = 5;
+
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
+                  << std::endl;
+        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
+                  << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
+                     "comparing with the host-based reduction"
+                  << std::endl;
+    };
+
+    int processArgs(int argc, char* argv[])
+    {
+        unsigned int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<std::size_t>(optarg);
+                break;
+            case 'S':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                scales = getTypeValuesFromString<float>(optarg);
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                do_verification = static_cast<bool>(std::atoi(optarg));
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        init_method = std::atoi(argv[optind++]);
+        nrepeat     = std::atoi(argv[optind]);
+
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+
+        return (0);
+    };
+};
+
+template <typename ReduceDims>
+static std::vector<int> get_reduce_dims()
+{
+    std::vector<int> resDims;
+
+    static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
+
+    return (resDims);
+};
+
+template <typename ReduceDims>
+static std::vector<int> get_invariant_dims()
+{
+    std::vector<int> resDims;
+    unsigned int incFlag = 0;
+
+    static_for<0, ReduceDims::Size(), 1>{}(
+        [&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
+
+    for(int dim = 0; dim < Rank; dim++)
+    {
+        if(incFlag & (0x1 << dim))
+            continue;
+        resDims.push_back(dim);
+    };
+
+    return (resDims);
+};
+
+int main(int argc, char* argv[])
+{
+    using namespace ck::host_reduce;
+
+    SimpleAppArgs args;
+
+    if(args.processArgs(argc, argv) < 0)
+        return (-1);
+
+    constexpr bool op_support_indices =
+        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
+         ReduceOpId == ReduceTensorOp_t::AMAX);
+
+    constexpr bool NeedIndices =
+        (op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
+
+    // if input is half type, no reason to use float for indiced reduction operation and must use
+    // float for non-indiced reduction operation for accuracy
+    constexpr bool invalid_reduce_1 =
+        std::is_same<InDataType, half_float::half>::value &&
+        ((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
+         (op_support_indices && !std::is_same<AccDataType, half_float::half>::value));
+
+    // if input is float type, no reason to use double for indiced reduction operation
+    constexpr bool invalid_reduce_2 =
+        std::is_same<InDataType, float>::value &&
+        (op_support_indices && !std::is_same<AccDataType, float>::value);
+
+    // indices option can only be used when it is really needed
+    constexpr bool invalid_reduce_3 =
+        (!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
+
+    if constexpr(invalid_reduce)
+        std::cout << "Reduction setting is not supported, exiting!" << std::endl;
+
+    Tensor<InDataType> in(args.inLengths);
+
+    const std::vector<int> InvariantDims = get_invariant_dims<ReduceDims_>();
+    const std::vector<int> ReduceDims    = get_reduce_dims<ReduceDims_>();
+
+    std::vector<std::size_t> outLengths;
+
+    if(InvariantDims.empty())
+        outLengths.push_back(1);
+    else
+        for(auto dim : InvariantDims)
+            outLengths.push_back(args.inLengths[dim]);
+
+    Tensor<OutDataType> out_ref(outLengths);
+    Tensor<OutDataType> out(outLengths);
+    Tensor<int> out_indices_ref(outLengths);
+    Tensor<int> out_indices(outLengths);
+
+    auto inStrides  = in.mDesc.GetStrides();
+    auto outStrides = out.mDesc.GetStrides();
+
+    size_t invariant_total_length = out.mDesc.GetElementSize();
+    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+
+    float alpha = args.scales[0];
+    float beta  = args.scales[1];
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    if(args.do_verification)
+    {
+        switch(args.init_method)
+        {
+        case 0:
+            in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{}, num_thread);
+            break;
+        case 1:
+            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
+            break;
+        default:
+            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
+            if(beta != 0.0f)
+                out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{1, 5}, num_thread);
+        }
+
+        if(beta != 0.0f)
+            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
+                out.mData[i] = out_ref.mData[i];
+    };
+
+    // these buffers are usually provided by the user application
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+
+    in_dev.ToDevice(in.mData.data());
+
+    if(beta != 0.0f)
+        out_dev.ToDevice(out.mData.data());
+
+    size_t indicesSizeInBytes = NeedIndices ? 
 + size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0; + + DeviceMem out_indices_dev(indicesSizeInBytes); + + if(args.do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims); + + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); + }; + + const auto i_inLengths = to_int_vector(args.inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + auto reduce = DeviceReduceInstance{}; + + auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths); + + DeviceMem ws_dev(wsSizeInBytes); + + auto argument_ptr = + reduce.MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation{static_cast(reduce_total_length)}, + AccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cout + << "The runtime parameters seem not to be supported by the DeviceReduce instance, exiting!" + << std::endl; + + return (-1); + }; + + std::string reduce_name = reduce.GetTypeString(); + + auto invoker_ptr = reduce.MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + + invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + if(args.do_verification) + { + out_dev.FromDevice(out.mData.data()); + check_error(out_ref, out); + + if(NeedIndices) + { + out_indices_dev.FromDevice(out_indices.mData.data()); + check_indices(out_indices_ref, out_indices); + }; + }; +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 1f7b7ad7bd..3ebc0ee30b 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -28,6 +28,8 @@ set(CONV2D_WRW_XDL_SOURCE 13_conv2d_backward_weight_xdl/main.cpp) set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp) set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp) set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp) +set(POOL2D_FWD_SOURCE 12_pool2d_fwd/pool2d_fwd.cpp) +set(REDUCE_BLOCKWISE_SOURCE 13_reduce_blockwise/reduce_blockwise.cpp) add_executable(gemm_xdl ${GEMM_XDL_SOURCE}) add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE}) @@ -44,6 +46,8 @@ add_executable(conv2d_wrw_xdl ${CONV2D_WRW_XDL_SOURCE}) add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE}) add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE}) add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE}) +add_executable(pool2d_fwd ${POOL2D_FWD_SOURCE}) +add_executable(reduce_blockwise ${REDUCE_BLOCKWISE_SOURCE}) target_link_libraries(gemm_xdl PRIVATE host_tensor) target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor) @@ -60,4 +64,6 @@ target_link_libraries(conv2d_wrw_xdl PRIVATE host_tensor) target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor) target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor) target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor) +target_link_libraries(pool2d_fwd PRIVATE host_tensor) +target_link_libraries(reduce_blockwise PRIVATE host_tensor) diff --git a/host/host_tensor/include/device.hpp 
b/host/host_tensor/include/device.hpp index cb1a6effa1..87af0bbd78 100644 --- a/host/host_tensor/include/device.hpp +++ b/host/host_tensor/include/device.hpp @@ -48,6 +48,7 @@ template float launch_and_time_kernel( F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) { +#if 1 KernelTimer timer; printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", @@ -80,5 +81,10 @@ float launch_and_time_kernel( // std::this_thread::sleep_for (std::chrono::microseconds(10)); return timer.GetElapsedTime() / nrepeat; +#else + launch_kernel(kernel, grid_dim, block_dim, lds_byte, args...); + + return 0; +#endif } #endif diff --git a/host/host_tensor/include/host_conv.hpp b/host/host_tensor/include/host_conv.hpp index 9285d0afd8..3d2588c08b 100644 --- a/host/host_tensor/include/host_conv.hpp +++ b/host/host_tensor/include/host_conv.hpp @@ -77,12 +77,12 @@ void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor& in, const auto X = wei.mDesc.GetLengths()[3]; const auto C = wei.mDesc.GetLengths()[4]; - auto f_ndhwc = [&](auto n, auto do__, auto ho_, auto wo_, auto k) { + auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) { // do__ must be converted to signed integer, otherwise zmin might be wrong in cases // negative values. - const int do_ = static_cast(do__); - const int ho = static_cast(ho_); - const int wo = static_cast(wo_); + const int do_ = static_cast(do_tmp); + const int ho = static_cast(ho_tmp); + const int wo = static_cast(wo_tmp); const int zmin = std::max(0, (in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) / diff --git a/host/host_tensor/include/host_generic_reduction.hpp b/host/host_tensor/include/host_generic_reduction.hpp new file mode 100644 index 0000000000..d10184aaf6 --- /dev/null +++ b/host/host_tensor/include/host_generic_reduction.hpp @@ -0,0 +1,424 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef HOST_GENERIC_REDUCTION_HPP_ +#define HOST_GENERIC_REDUCTION_HPP_ + +#include +#include +#include +#include +#include +#include + +#include "reduction_enums.hpp" +#include "host_reduce_util.hpp" + +using float16 = half_float::half; + +namespace ck { + +namespace host_reduce { + +template +static void +get_all_indexes(const std::vector& dimLengths, int dim, std::vector>& indexes) +{ + if(dim < dimLengths.size()) + { + std::vector> updated_indexes; + + if(dim == 0) + { + assert(indexes.size() == 0); + assert(dimLengths[dim] > 0); + for(T i = 0; i < dimLengths[dim]; i++) + { + std::vector index = {i}; + + updated_indexes.push_back(index); + }; + } + else + { + // go through all the current indexes + for(const auto& index : indexes) + for(T i = 0; i < dimLengths[dim]; i++) + { + auto index_new = index; + index_new.push_back(i); + + updated_indexes.push_back(index_new); + }; + }; + + // update to the indexes (output) + indexes = updated_indexes; + + // further to construct the indexes from the updated status + get_all_indexes(dimLengths, dim + 1, indexes); + }; +}; + +template +static T get_offset_from_index(const std::vector& strides, const std::vector& index) +{ + T offset = 0; + + assert(strides.size() == index.size()); + + for(int i = 0; i < index.size(); i++) + offset += strides[i] * static_cast(index[i]); + + return (offset); +}; + +template +static inline T get_flatten_offset(const std::vector& lengths, const std::vector& index) +{ + T offset = 0; + + assert(lengths.size() == index.size() && lengths.size() > 0); + + int len = lengths.size(); + T stride = 1; + + // for len==1, the loop is not executed + for(int i = len - 1; i > 0; i--) + { + offset += stride * static_cast(index[i]); + + stride *= lengths[i]; + }; + + offset += stride * static_cast(index[0]); + + return (offset); +}; + +template +class ReductionHost +{ + public: + ReductionHost() = default; + ReductionHost(HostTensorDescriptor& inDesc, + HostTensorDescriptor& outDesc, + const std::vector& invariantDims_, + const std::vector& toReduceDims_) + { + this->inLengths = to_int_vector(inDesc.GetLengths()); + this->outLengths = to_int_vector(outDesc.GetLengths()); + this->inStrides = to_int_vector(inDesc.GetStrides()); + this->outStrides = to_int_vector(outDesc.GetStrides()); + + this->invariantDims = invariantDims_; + this->toReduceDims = toReduceDims_; + + assert(this->inLengths.size() == this->outLengths.size()); + assert(!this->toReduceDims.empty()); + + for(const auto dim : this->invariantDims) + this->invariantLengths.push_back(this->inLengths[dim]); + + for(const auto dim : this->toReduceDims) + toReduceLengths.push_back(this->inLengths[dim]); + + this->reduceAllDims = this->invariantDims.empty(); + }; + + ~ReductionHost(){}; + + void + Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) + { + if constexpr(NeedIndices) + RunImpl_with_indices(alpha, in_data, beta, out_data, indices); + else + RunImpl_no_indices(alpha, in_data, beta, out_data); + }; + + private: + std::vector inLengths; + std::vector outLengths; + std::vector inStrides; + std::vector outStrides; + + std::vector invariantLengths; + std::vector toReduceLengths; + + std::vector invariantDims; + std::vector toReduceDims; + + bool reduceAllDims; + + void RunImpl_with_indices( + float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) + { + using ck::host_reduce::binop_with_nan_check; + using 
ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::PosUnaryOpFn; + using ck::host_reduce::PreUnaryOpFn; + using ck::host_reduce::ReduceOpFn2; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce = ReduceOpFn2(); + + int divider = 1; + for(int i = 0; i < toReduceLengths.size(); i++) + divider *= toReduceLengths[i]; + + auto PreUnaryOp = PreUnaryOpFn(divider); + auto PosUnaryOp = PosUnaryOpFn(divider); + + if(reduceAllDims) + { + std::vector> indexes_1; + + get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space + + auto accuVal = ReduceOpZeroVal(); + int accuIndex = 0; + + // go through indexes of the invariant dimensions + for(const auto& src_index : indexes_1) + { + auto src_offset = get_offset_from_index(this->inStrides, src_index); + + auto currVal = static_cast(in_data[src_offset]); + + // unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually + // done + PreUnaryOp(currVal); + + auto currIndex = get_flatten_offset(inLengths, src_index); + binop_with_nan_check2( + opReduce, accuVal, currVal, accuIndex, currIndex); + }; + + // scale the accumulated value + if(!float_equal_one(alpha)) + accuVal *= static_cast(alpha); + + // scale the prior dst value and add it to the accumulated value + if(!float_equal_zero(beta)) + accuVal += static_cast(out_data[0]) * static_cast(beta); + + // store the reduced value to dst location + out_data[0] = static_cast(accuVal); + indices[0] = accuIndex; + } + else + { + std::vector> indexes_1, indexes_2; + + get_all_indexes( + this->invariantLengths, 0, indexes_1); // generate the invariant indexes space + get_all_indexes( + this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space + + // go through indexes of the invariant dimensions + for(const auto& index_1 : indexes_1) + { + std::vector src_index; + std::vector dst_index; + + src_index.resize(this->inLengths.size()); + + // generate the part of src index belonging to invariant dims + for(int k = 0; k < invariantDims.size(); k++) + src_index[invariantDims[k]] = index_1[k]; + + for(int k = 0; k < invariantDims.size(); k++) + dst_index.push_back(index_1[k]); + + int dst_offset = get_offset_from_index(this->outStrides, dst_index); + + AccDataType accuVal = ReduceOpZeroVal(); + int accuIndex = 0; + + // go through indexes of the toReduce dimensions + for(const auto& index_2 : indexes_2) + { + // generate the part of src index belonging to toReduce dims + for(int k = 0; k < toReduceDims.size(); k++) + src_index[toReduceDims[k]] = index_2[k]; + + auto src_offset = get_offset_from_index(this->inStrides, src_index); + + auto currVal = static_cast(in_data[src_offset]); + // unary operation before reducing, needed by AMAX. 
For MIN/MAX, nothing is + // actually done + PreUnaryOp(currVal); + + auto currIndex = get_flatten_offset(toReduceLengths, index_2); + binop_with_nan_check2( + opReduce, accuVal, currVal, accuIndex, currIndex); + }; + + // scale the accumulated value + if(!float_equal_one(alpha)) + accuVal *= static_cast(alpha); + + // scale the prior dst value and add it to the accumulated value + if(!float_equal_zero(beta)) + accuVal += static_cast(out_data[dst_offset]) * + static_cast(beta); + + // store the reduced value to dst location + out_data[dst_offset] = static_cast(accuVal); + indices[dst_offset] = accuIndex; + }; + }; + }; // end of RunImpl_with_indices() + + void + RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) + { + using ck::host_reduce::binop_with_nan_check; + using ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::PosUnaryOpFn; + using ck::host_reduce::PreUnaryOpFn; + using ck::host_reduce::ReduceOpFn; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce = ReduceOpFn(); + + int divider = 1; + for(int i = 0; i < toReduceLengths.size(); i++) + divider *= toReduceLengths[i]; + + auto PreUnaryOp = PreUnaryOpFn(divider); + auto PosUnaryOp = PosUnaryOpFn(divider); + + if(reduceAllDims) + { + std::vector> indexes_1; + + get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space + + auto accuVal = ReduceOpZeroVal(); + + // go through indexes of the invariant dimensions + for(const auto& src_index : indexes_1) + { + auto src_offset = get_offset_from_index(this->inStrides, src_index); + + auto currVal = static_cast(in_data[src_offset]); + + PreUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + PosUnaryOp(accuVal); + + // scale the accumulated value + if(!float_equal_one(alpha)) + accuVal *= static_cast(alpha); + + // scale the prior dst value and add it to the accumulated value + if(!float_equal_zero(beta)) + accuVal += static_cast(out_data[0]) * static_cast(beta); + + // store the reduced value to dst location + out_data[0] = static_cast(accuVal); + } + else + { + std::vector> indexes_1, indexes_2; + + get_all_indexes( + this->invariantLengths, 0, indexes_1); // generate the invariant indexes space + get_all_indexes( + this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space + + // go through indexes of the invariant dimensions + for(const auto& index_1 : indexes_1) + { + std::vector src_index; + std::vector dst_index; + + src_index.resize(this->inLengths.size()); + + for(int k = 0; k < invariantDims.size(); k++) + dst_index.push_back(index_1[k]); + + int dst_offset = get_offset_from_index(this->outStrides, dst_index); + + // generate the part of src index belonging to invariant dims + for(int k = 0; k < invariantDims.size(); k++) + src_index[invariantDims[k]] = index_1[k]; + + AccDataType accuVal = ReduceOpZeroVal(); + + // go through indexes of the toReduce dimensions + for(const auto& index_2 : indexes_2) + { + // generate the part of src index belonging to toReduce dims + for(int k = 0; k < toReduceDims.size(); k++) + src_index[toReduceDims[k]] = index_2[k]; + + auto src_offset = get_offset_from_index(this->inStrides, src_index); + + auto currVal = static_cast(in_data[src_offset]); + + PreUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + PosUnaryOp(accuVal); + + // scale the accumulated value + if(!float_equal_one(alpha)) + accuVal *= 
static_cast(alpha); + + // scale the prior dst value and add it to the accumulated value + if(!float_equal_zero(beta)) + accuVal += static_cast(out_data[dst_offset]) * + static_cast(beta); + + // store the reduced value to dst location + out_data[dst_offset] = static_cast(accuVal); + }; + }; + }; // end of RunImpl_no_indices() +}; + +}; // end of namespace host_reduce + +}; // end of namespace ck + +#endif diff --git a/host/host_tensor/include/host_reduce_util.hpp b/host/host_tensor/include/host_reduce_util.hpp new file mode 100644 index 0000000000..a176962bb1 --- /dev/null +++ b/host/host_tensor/include/host_reduce_util.hpp @@ -0,0 +1,291 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_HOST_REDUCE_UTIL_HPP +#define GUARD_HOST_REDUCE_UTIL_HPP + +#include +#include +#include +#include +#include +#include + +#include "reduction_enums.hpp" + +namespace ck { + +namespace host_reduce { + +using ck::NanPropagation_t; +using ck::ReduceTensorOp_t; + +template +static inline bool float_equal_one(T); + +static inline bool float_equal_one(float x) { return x == 1.0f; }; + +static inline bool float_equal_one(double x) { return x == 1.0; }; + +static inline bool float_equal_one(half_float::half x) +{ + return x == static_cast(1.0f); +}; + +template +static inline bool float_equal_zero(T x); + +static inline bool float_equal_zero(float x) { return x == 0.0f; }; + +static inline bool float_equal_zero(double x) { return x == 0.0; }; + +static inline bool float_equal_zero(half_float::half x) +{ + return x == static_cast(0.0f); +}; + +template +__host__ static inline std::function PreUnaryOpFn(int) +{ + using std::abs; + + if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1) + { + return ([&](compType& a_) { a_ = abs(a_); }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) + { + return ([&](compType& a_) { a_ = a_ * a_; }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) + { + return ([&](compType& a_) { a_ = abs(a_); }); + } + else + { + // ReduceTensorOp_t::AVG: + // ReduceTensorOp_t::ADD: + // ReduceTensorOp_t::MUL: + // ReduceTensorOp_t::MIN: + // ReduceTensorOp_t::MAX: + return ([&](compType&) {}); + }; +}; + +template +__host__ static inline std::function PosUnaryOpFn(int divider) +{ + using std::sqrt; + + if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) + { + return ([&](compType& a_) { a_ = sqrt(a_); }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG) + { + return ([&, divider](compType& a_) { + a_ = a_ / static_cast(static_cast(divider)); + }); + } + else + { + // ReduceTensorOp_t::ADD: + // ReduceTensorOp_t::NORM1: + // ReduceTensorOp_t::MUL: + // ReduceTensorOp_t::MIN: + // ReduceTensorOp_t::MAX: + // ReduceTensorOp_t::AMAX: + return ([&](compType&) {}); + } +}; + +template +__host__ static inline std::function ReduceOpFn() +{ + if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG || + ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2) + { + return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) + { + return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + { + return ([&](compType& a_, compType b_) { + if(a_ > b_) + a_ = b_; + }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) + { + return ([&](compType& a_, compType b_) { + if(a_ < b_) + a_ = b_; + }); + } +}; + +template +__host__ static inline std::function ReduceOpFn2() +{ + if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + { + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ > b_) + { + a_ = b_; + changed = true; + } + else + changed = false; + }); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) + { + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ < b_) + { + a_ = b_; + changed = true; + } + else + changed = false; + }); + } + else + { + // ReduceTensorOp_t::ADD: + // ReduceTensorOp_t::MUL: + // ReduceTensorOp_t::AVG: + // ReduceTensorOp_t::NORM1: + 
// ReduceTensorOp_t::NORM2: + return (std::function{}); + }; +}; + +template +__host__ static inline compType ReduceOpZeroVal() +{ + if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) + { + return (static_cast(1.0f)); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + { + return (std::numeric_limits::max()); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX) + { + return (std::numeric_limits::lowest()); + } + else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) + { + return (static_cast(0.0f)); + } + else + { + // ReduceTensorOp_t::ADD + // ReduceTensorOp_t::AVG + // ReduceTensorOp_t::NORM1 + // ReduceTensorOp_t::NORM2 + return (static_cast(0.0f)); + }; +}; + +template +__host__ static inline void binop_with_nan_check(std::function opReduce, + compType& accuVal, + compType currVal) +{ + using std::isnan; + + if constexpr(!PropagateNan) + { + opReduce(accuVal, currVal); + } + else + { + if(isnan(currVal)) + accuVal = currVal; + else + opReduce(accuVal, currVal); + }; +}; + +template +__host__ static inline void +binop_with_nan_check2(std::function opReduce, + compType& accuVal, + compType currVal, + int& accuIndex, + int currIndex) +{ + using std::isnan; + + if constexpr(!PropagateNan) + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + } + else + { + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; + }; +}; + +}; // namespace host_reduce + +static inline std::vector to_int_vector(const std::vector& inData) +{ + std::vector outData; + + for(auto elem : inData) + outData.push_back(static_cast(elem)); + + return (outData); +}; + +}; // namespace ck + +#endif diff --git a/host/host_tensor/include/host_tensor.hpp b/host/host_tensor/include/host_tensor.hpp index adaa60e843..f9f462d7fd 100644 --- a/host/host_tensor/include/host_tensor.hpp +++ b/host/host_tensor/include/host_tensor.hpp @@ -356,4 +356,28 @@ void check_error(const Tensor& ref, const Tensor& result) std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; } +template +void check_indices(const Tensor& ref, const Tensor& result) +{ + bool has_error = false; + int error_count = 0; + + for(int i = 0; i < ref.mData.size(); ++i) + { + if(ref.mData[i] != result.mData[i]) + { + std::cerr << std::endl + << "Indices different at position " << i << " (ref: " << ref.mData[i] + << ", result: " << result.mData[i] << ")" << std::endl; + has_error = true; + error_count++; + if(error_count == 20) + break; + }; + } + + if(!has_error) + std::cout << std::endl << "Indices result is completely accurate!" << std::endl; +} + #endif diff --git a/host/host_tensor/include/host_tensor_generator.hpp b/host/host_tensor/include/host_tensor_generator.hpp index 747ec2ead4..57ad5b819d 100644 --- a/host/host_tensor/include/host_tensor_generator.hpp +++ b/host/host_tensor/include/host_tensor_generator.hpp @@ -59,7 +59,7 @@ struct GeneratorTensor_2 template T operator()(Is...) 
{ - return (std::rand() % (max_value - min_value)) + min_value; + return static_cast((std::rand() % (max_value - min_value)) + min_value); } }; @@ -101,7 +101,7 @@ struct GeneratorTensor_3 { float tmp = float(std::rand()) / float(RAND_MAX); - return min_value + tmp * (max_value - min_value); + return static_cast(min_value + tmp * (max_value - min_value)); } }; diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index 7187147647..999c7b85cd 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -20,12 +20,13 @@ set(PROFILER_SOURCE src/profile_gemm_bias_2d.cpp src/profile_gemm_bias_relu.cpp src/profile_gemm_bias_relu_add.cpp + src/profile_batched_gemm.cpp src/profile_conv_fwd.cpp src/profile_conv_fwd_bias_relu.cpp src/profile_conv_fwd_bias_relu_add.cpp src/profile_conv_fwd_bias_relu_atomic_add.cpp - src/profile_batched_gemm.cpp src/profile_conv_bwd_data.cpp + src/profile_reduce.cpp ) add_executable(ckProfiler ${PROFILER_SOURCE}) @@ -35,9 +36,10 @@ target_link_libraries(ckProfiler PRIVATE device_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_2d_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) +target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance) -target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance) +target_link_libraries(ckProfiler PRIVATE device_reduce_instance) diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp new file mode 100644 index 0000000000..70e07a5a13 --- /dev/null +++ b/profiler/include/profile_reduce_impl.hpp @@ -0,0 +1,626 @@ +#pragma once +#include "device_reduce.hpp" +#include "device_reduce_instance.hpp" +#include "reduction_enums.hpp" +#include "host_generic_reduction.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +template +struct ReduceDescription +{ + static constexpr int Rank_ = Rank; + static constexpr int ReduceOpId_ = ReduceOpId; + static constexpr int NanOpt_ = NanOpt; + static constexpr int IndicesOpt_ = IndicesOpt; + + using ReduceDims_ = ReduceDims; +}; + +using reduce_description_instances = + std::tuple, 0, 0, 0>, // for ADD + ReduceDescription<4, Sequence<0>, 0, 0, 0>, + ReduceDescription<2, Sequence<1>, 0, 0, 0>, + + ReduceDescription<4, Sequence<0, 1, 2>, 5, 0, 0>, // for AVG + ReduceDescription<4, Sequence<0>, 5, 0, 0>, + ReduceDescription<2, Sequence<1>, 5, 0, 0>, + + ReduceDescription<4, Sequence<0, 1, 2>, 7, 0, 0>, // for NORM2 + ReduceDescription<4, Sequence<0>, 7, 0, 0>, + ReduceDescription<2, Sequence<1>, 7, 0, 0>, + + ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 0>, // for MIN + ReduceDescription<4, Sequence<0>, 2, 0, 0>, + ReduceDescription<2, Sequence<1>, 2, 0, 0>, + ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 0>, // for MAX + ReduceDescription<4, Sequence<0>, 3, 0, 0>, + ReduceDescription<2, Sequence<1>, 3, 0, 0>, + ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 0>, // for AMAX + ReduceDescription<4, Sequence<0>, 4, 0, 0>, + ReduceDescription<2, 
Sequence<1>, 4, 0, 0>, + + ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 1>, // for MIN + ReduceDescription<4, Sequence<0>, 2, 0, 1>, + ReduceDescription<2, Sequence<1>, 2, 0, 1>, + ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 1>, // for MAX + ReduceDescription<4, Sequence<0>, 3, 0, 1>, + ReduceDescription<2, Sequence<1>, 3, 0, 1>, + ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 1>, // for AMAX + ReduceDescription<4, Sequence<0>, 4, 0, 1>, + ReduceDescription<2, Sequence<1>, 4, 0, 1>>; + +template +bool description_match(const DescriptionType& description, + int Rank, + const std::vector& ReduceDims, + ReduceTensorOp_t ReduceOpId, + NanPropagation_t NanOpt, + ReduceTensorIndices_t IndicesOpt) +{ + if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || + description.NanOpt_ != static_cast(NanOpt) || + description.IndicesOpt_ != static_cast(IndicesOpt)) + return (false); + + if(DescriptionType::ReduceDims_::Size() != ReduceDims.size()) + return (false); + + bool result = true; + + static_for<0, DescriptionType::ReduceDims_::Size(), 1>{}([&](auto i) { + if(DescriptionType::ReduceDims_::At(i) != ReduceDims[i]) + result = false; + }); + + return (result); +}; + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +static std::vector get_reduce_dims() +{ + std::vector resDims; + + static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); }); + + return (resDims); +}; + +template +static std::vector get_invariant_dims() +{ + std::vector resDims; + unsigned int incFlag = 0; + + static_for<0, ReduceDims::Size(), 1>{}( + [&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); }); + + for(int dim = 0; dim < Rank; dim++) + { + if(incFlag & (0x1 << dim)) + continue; + resDims.push_back(dim); + }; + + return (resDims); +}; + +template +static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) +{ + std::ofstream outFile(fileName, std::ios::binary); + if(outFile) + { + outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); + outFile.close(); + std::cout << "Write output to file " << fileName << std::endl; + } + else + { + std::cout << "Could not open file " << fileName << " for writing" << std::endl; + } +}; + +// map the data type used by the GPU kernels to the corresponding type used by the host codes +template +struct type_mapping +{ + using outDataType = inDataType; +}; + +template <> +struct type_mapping +{ + using outDataType = half_float::half; +}; + +template +void profile_reduce_impl_impl(bool do_verification, + int init_method, + bool do_log, + bool do_dumpout, + int nrepeat, + const std::vector& inLengths, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::host_reduce; + + constexpr bool op_support_indices = + (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || + ReduceOpId == ReduceTensorOp_t::AMAX); + + constexpr bool NeedIndices = + (op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES)); + + constexpr bool PropagateNan = (NanOpt == NanPropagation_t::PROPAGATE_NAN); + + constexpr bool out_support_atomic_add = std::is_same::value; + constexpr bool op_support_atomic_add = + !op_support_indices && ReduceOpId != ReduceTensorOp_t::NORM2; + constexpr bool use_atomic_add = (out_support_atomic_add && 
op_support_atomic_add); + + // 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations + // 2) If InDataType is half_t, must use float as AccDataType for non-indexable reduction + // operations + constexpr bool invalid_reduce_1 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InDataType is float, must use float as AccDataType for indexable reduction operations + constexpr bool invalid_reduce_2 = + std::is_same::value && + (op_support_indices && !std::is_same::value); + + // 1) The indices can only be used when the reduction operation is indexable + constexpr bool invalid_reduce_3 = + (!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3); + + if constexpr(!invalid_reduce) + { + Tensor in(inLengths); + + const std::vector OuterDims = get_invariant_dims(); + const std::vector ReduceDims = get_reduce_dims(); + + std::vector outLengths; + + if(OuterDims.empty()) + outLengths.push_back(1); + else + for(auto dim : OuterDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(do_verification) + { + switch(init_method) + { + case 0: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + size_t indicesSizeInBytes = NeedIndices ? 
out.mDesc.GetElementSize() * sizeof(int) : 0; + + DeviceMem out_indices_dev(indicesSizeInBytes); + + float best_avg_time = 0; + float best_gb_per_sec = 0; + + using InElementwiseOperation_0 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_0 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_1 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_1 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_2 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_2 = + typename reduce_unary_operator:: + AccElementwiseOperation; + + using DeviceReduceInstPtr0 = + DeviceReducePtr; + using DeviceReduceInstPtr1 = + DeviceReducePtr; + using DeviceReduceInstPtr2 = + DeviceReducePtr; + + std::vector reduce0_ptrs; + std::vector reduce1_ptrs; + std::vector reduce2_ptrs; + + add_device_reduce_instance_threadwise(reduce0_ptrs); + + add_device_reduce_instance_blockwise(reduce0_ptrs); + + if constexpr(use_atomic_add) + add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); + else + add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + + // used for secondary reduction + if constexpr(!use_atomic_add) + add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + + if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + { + throw std::runtime_error("Wrong! No device REDUCE instance found"); + }; + + if(do_verification) + { + using hInType = typename type_mapping::outDataType; + using hOutType = typename type_mapping::outDataType; + using hCompType = typename type_mapping::outDataType; + + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, OuterDims, ReduceDims); + + hostReduce.Run(alpha, + reinterpret_cast(in.mData.data()), + beta, + reinterpret_cast(out_ref.mData.data()), + out_indices_ref.mData.data()); + }; + + const auto i_inLengths = to_int_vector(inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + for(auto& reduce_ptr : reduce0_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + + DeviceMem ws_dev(wsSizeInBytes); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation_0{static_cast(reduce_total_length)}, + AccElementwiseOperation_0{static_cast(reduce_total_length)}); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + std::string reduce_name = reduce_ptr->GetTypeString(); + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + + std::size_t num_bytes = + invariant_total_length * reduce_total_length * sizeof(InDataType) + + invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + if(gb_per_sec > best_gb_per_sec) + { + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_dev.FromDevice(out.mData.data()); + check_error(out_ref, out); + + if(NeedIndices) + { + 
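// for indexed reductions, read the device-side indices back and compare them + // one by one with the host reference + 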
out_indices_dev.FromDevice(out_indices.mData.data()); + check_indices(out_indices_ref, out_indices); + }; + + if(do_log) + { + LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "out_device: ", out.mData, ",") << std::endl; + }; + }; + + if(do_dumpout) + { + dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize()); + dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); + dumpBufferToFile( + "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); + if(NeedIndices) + { + dumpBufferToFile("dump_indices.bin", + out_indices.mData.data(), + out_indices.mDesc.GetElementSize()); + dumpBufferToFile("dump_indices_host.bin", + out_indices_ref.mData.data(), + out_indices_ref.mDesc.GetElementSize()); + }; + }; + }; + + for(auto& reduce_ptr : reduce1_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + + DeviceMem ws_dev(wsSizeInBytes); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation_1{static_cast(reduce_total_length)}, + AccElementwiseOperation_1{static_cast(reduce_total_length)}); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + std::string reduce_name = reduce_ptr->GetTypeString(); + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + + std::size_t num_bytes = + invariant_total_length * reduce_total_length * sizeof(InDataType) + + invariant_total_length * sizeof(OutDataType); + + std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); + std::vector inStrides2{inLengths2[1], 1}; + + for(auto& reduce2_ptr : reduce2_ptrs) + { + auto argument2_ptr = reduce2_ptr->MakeArgumentPointer( + inLengths2, + inStrides2, + i_outLengths, + i_outStrides, + alpha, + beta, + ws_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation_2{static_cast(reduce_total_length)}, + AccElementwiseOperation_2{static_cast(reduce_total_length)}); + + if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) + continue; + + std::string reduce2_name = reduce2_ptr->GetTypeString(); + + auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); + + float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat); + + std::size_t num_bytes_2 = + static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType); + + float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2); + + std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << gb_per_sec + << " GB/s, " << reduce_name << " => " << reduce2_name << std::endl; + + if(gb_per_sec > best_gb_per_sec) + { + best_avg_time = avg_time + avg_time_2; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_dev.FromDevice(out.mData.data()); + check_error(out_ref, out); + + if(NeedIndices) + { + out_indices_dev.FromDevice(out_indices.mData.data()); + check_indices(out_indices_ref, out_indices); + }; + + if(do_log) + { + LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "out_device: ", out.mData, ",") + << std::endl; + } + } + + if(do_dumpout) + { + dumpBufferToFile("dump_in.bin", in.mData.data(), 
in.mDesc.GetElementSize()); + dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); + dumpBufferToFile( + "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); + if(NeedIndices) + { + dumpBufferToFile("dump_indices.bin", + out_indices.mData.data(), + out_indices.mDesc.GetElementSize()); + dumpBufferToFile("dump_indices_host.bin", + out_indices_ref.mData.data(), + out_indices_ref.mDesc.GetElementSize()); + }; + }; + }; + }; + + std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" + << std::endl; + } + else + { + std::cout << "The requested reduction operation is not supported, please check !!!" + << std::endl; + }; +}; + +template +void profile_reduce_impl(bool do_verification, + int init_method, + bool do_log, + bool do_dumpout, + int nrepeat, + const std::vector& inLengths, + const std::vector& ReduceDims, + ReduceTensorOp_t ReduceOpId, + NanPropagation_t NanOpt, + ReduceTensorIndices_t IndicesOpt, + float alpha, + float beta) +{ + bool matched = false; + + using tuple_of_description_instances = + tensor_operation::device::device_reduce_instance::reduce_description_instances; + + const auto tuple_object = tuple_of_description_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using descType = remove_cvref_t(tuple_object))>; + + if(!description_match( + descType{}, inLengths.size(), ReduceDims, ReduceOpId, NanOpt, IndicesOpt)) + return; + + profile_reduce_impl_impl(descType::ReduceOpId_), + static_cast(descType::NanOpt_), + static_cast(descType::IndicesOpt_)>( + do_verification, init_method, do_log, do_dumpout, nrepeat, inLengths, alpha, beta); + + matched = true; + }); +}; + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp index 8d5e4e3f7f..592f10321c 100644 --- a/profiler/src/profile_gemm_bias_relu_add.cpp +++ b/profiler/src/profile_gemm_bias_relu_add.cpp @@ -59,11 +59,6 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) const int StrideC = std::stoi(argv[13]); const int StrideC1 = std::stoi(argv[14]); - int KBatch = 1; - - if(argc == 16) - KBatch = std::stoi(argv[15]); - if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_gemm_bias_relu_add_impl +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "reduction_enums.hpp" + +#include "profile_reduce_impl.hpp" + +using namespace std; + +using ck::NanPropagation_t; +using ck::ReduceTensorIndices_t; +using ck::ReduceTensorOp_t; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"toReduceDims", required_argument, nullptr, 'R'}, + {"reduceOp", required_argument, nullptr, 'O'}, + {"compType", required_argument, nullptr, 'C'}, + {"outType", required_argument, nullptr, 'W'}, + {"nanOpt", required_argument, nullptr, 'N'}, + {"indicesOpt", required_argument, nullptr, 'I'}, + {"scales", required_argument, nullptr, 'S'}, + {"half", no_argument, nullptr, '?'}, + {"double", no_argument, nullptr, '?'}, + {"dumpout", required_argument, nullptr, 'o'}, + {"verify", required_argument, nullptr, 'v'}, + {"log", required_argument, nullptr, 'l'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +template +static T 
getSingleValueFromString(const string& valueStr) +{ + std::istringstream iss(valueStr); + + T val; + + iss >> val; + + return (val); +}; + +template +static std::vector getTypeValuesFromString(const char* cstr_values) +{ + std::string valuesStr(cstr_values); + + std::vector values; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = valuesStr.find(',', pos); + while(new_pos != std::string::npos) + { + const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); + + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + pos = new_pos + 1; + new_pos = valuesStr.find(',', pos); + }; + + std::string sliceStr = valuesStr.substr(pos); + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + return (values); +} + +typedef enum +{ + appHalf = 0, + appFloat = 1, + appInt32 = 2, + appInt8 = 3, + appInt8x4 = 4, + appBFloat16 = 5, + appDouble = 6, +} appDataType_t; + +static void check_reduce_dims(const int rank, const std::vector& toReduceDims) +{ + for(auto dim : toReduceDims) + { + if(dim < 0 || dim >= rank) + throw std::runtime_error("Invalid dimension index specified for Reducing"); + }; + + unsigned int flag = 0; + + for(auto dim : toReduceDims) + { + if(flag & (0x1 << dim)) + throw std::runtime_error("All toReduce dimensions should be different!"); + flag = flag | (0x1 << dim); + }; +}; + +class AppArgs +{ + private: + int option_index = 0; + + public: + bool use_half = false; + bool use_double = false; + + std::vector inLengths; + std::vector outLengths; + std::vector toReduceDims; + + std::vector scales; + + ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD; + appDataType_t compTypeId = appFloat; + appDataType_t outTypeId = appFloat; + + bool compType_assigned = false; + bool outType_assigned = false; + + NanPropagation_t nanOpt = NanPropagation_t::NOT_PROPAGATE_NAN; + ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES; + bool do_log = false; + bool do_verification = false; + bool do_dumpout = false; + + int init_method; + int nrepeat; + + bool need_indices = false; + + AppArgs() = default; + ~AppArgs() = default; + + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--toReduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; + std::cout << "--reduceOp or -O, enum value indicating the reduction operation" + << std::endl; + std::cout << "--compType or -C, enum value indicating the type of accumulated values used " + "during the reduction" + << std::endl; + std::cout << "--outType or -W, optional enum value indicating the type of the reduced " + "output, which could be float when the input data is half" + << std::endl; + std::cout << "--nanOpt or -N, enum value indicating the selection for NanOpt" << std::endl; + std::cout << "--indicesOpt or -I, enum value indicating the selection for IndicesOpt" + << std::endl; + std::cout << "--scales or -S, comma separated two float values for alpha and beta" + << std::endl; + std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl; + std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "--dumpout or -o, 1/0 to indicate whether to save the reduction result to files " + "for 
further analysis" + << std::endl; + std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + unsigned int ch; + + optind++; // to skip the "reduce" module name + + while(1) + { + ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + toReduceDims = getTypeValuesFromString(optarg); + break; + case 'O': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceOp = static_cast(std::atoi(optarg)); + break; + case 'C': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + compTypeId = static_cast(std::atoi(optarg)); + compType_assigned = true; + break; + case 'W': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + outTypeId = static_cast(std::atoi(optarg)); + outType_assigned = true; + break; + case 'N': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + nanOpt = static_cast(std::atoi(optarg)); + break; + case 'I': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + indicesOpt = static_cast(std::atoi(optarg)); + break; + case 'S': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + scales = getTypeValuesFromString(optarg); + + if(scales.size() != 2) + throw std::runtime_error("Invalid option format!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case 'o': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_dumpout = static_cast(std::atoi(optarg)); + break; + case 'l': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_log = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "half") + use_half = true; + else if(std::string(long_options[option_index].name) == "double") + use_double = true; + else if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + + default: + show_usage(argv[0]); + std::cerr << "Invalid cmd-line options!" 
<< std::endl; + return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!"); + + init_method = std::atoi(argv[optind++]); + nrepeat = std::atoi(argv[optind]); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX || + reduceOp == ReduceTensorOp_t::AMAX) + { + if(indicesOpt != ReduceTensorIndices_t::NO_INDICES) + need_indices = true; + + // for indexable operations, no need to assign compType and outType, just let them be + // same as inType + compType_assigned = false; + outType_assigned = false; + }; + + return (0); + }; + +}; // end of class AppArgs + +int profile_reduce(int argc, char* argv[]) +{ + using namespace ck::profiler; + + AppArgs args; + + if(args.processArgs(argc, argv) < 0) + return (-1); + + int rank = args.inLengths.size(); + + check_reduce_dims(rank, args.toReduceDims); + + if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1) + throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!"); + + if(args.use_half) + { + if(!args.compType_assigned) + args.compTypeId = appHalf; + + if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat)) + args.outTypeId = appFloat; + + if(!args.outType_assigned) + args.outTypeId = appHalf; + + if(args.compTypeId == appHalf) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_log, + args.do_dumpout, + args.nrepeat, + args.inLengths, + args.toReduceDims, + args.reduceOp, + args.nanOpt, + args.indicesOpt, + args.scales[0], + args.scales[1]); + } + else if(args.compTypeId == appFloat) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_log, + args.do_dumpout, + args.nrepeat, + args.inLengths, + args.toReduceDims, + args.reduceOp, + args.nanOpt, + args.indicesOpt, + args.scales[0], + args.scales[1]); + } + else + throw std::runtime_error("Invalid compType assignment!"); + } + else if(args.use_double) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_log, + args.do_dumpout, + args.nrepeat, + args.inLengths, + args.toReduceDims, + args.reduceOp, + args.nanOpt, + args.indicesOpt, + args.scales[0], + args.scales[1]); + } + else + { + if(args.compTypeId == appFloat) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_log, + args.do_dumpout, + args.nrepeat, + args.inLengths, + args.toReduceDims, + args.reduceOp, + args.nanOpt, + args.indicesOpt, + args.scales[0], + args.scales[1]); + } + else if(args.compTypeId == appDouble) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_log, + args.do_dumpout, + args.nrepeat, + args.inLengths, + args.toReduceDims, + args.reduceOp, + args.nanOpt, + args.indicesOpt, + args.scales[0], + args.scales[1]); + } + else + throw std::runtime_error("Invalid compType assignment!"); + }; + + return (0); +}; diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 2ea26105a0..80ce1f8324 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -2,8 +2,7 @@ #include #include #include -#include -#include +#include int profile_gemm(int, char*[]); int profile_batched_gemm(int, char*[]); @@ -15,6 +14,7 @@ int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); int profile_conv_bwd_data(int, char*[]); +int 
profile_reduce(int, char*[]); int main(int argc, char* argv[]) { @@ -58,6 +58,10 @@ int main(int argc, char* argv[]) { return profile_conv_bwd_data(argc, argv); } + else if(strcmp(argv[1], "reduce") == 0) + { + return profile_reduce(argc, argv); + } else { // clang-format off @@ -69,7 +73,8 @@ int main(int argc, char* argv[]) " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n" " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n" " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n" - " conv_bwd: BackwardConvolution\n"); + " conv_bwd: BackwardConvolution\n" + " reduce: REDUCE\n"); // clang-format on return 0; diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh new file mode 100755 index 0000000000..ff706f2d66 --- /dev/null +++ b/script/profile_reduce_no_index.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +PRECISION= ##--half + +if test -n $PRECISION && test "$PRECISION" = "--half"; then + CTYPE="-C 1" +else + CTYPE="" +fi + +WTYPE= + +if [ $# -ge 1 ] ; then + NREPEAT=$1 +else + NREPEAT=1 +fi + +Operation=7 + +## for generic validation +for op in $Operation; do + set -x + ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT + set +x +done + +Operation=5 + +## for performance evaluation (resnet50 NHWC => C) +for op in $Operation; do + set -x + ./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + ./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT + 
diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh
new file mode 100755
index 0000000000..ff706f2d66
--- /dev/null
+++ b/script/profile_reduce_no_index.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+PRECISION=  ##--half
+
+if test -n "$PRECISION" && test "$PRECISION" = "--half"; then
+    CTYPE="-C 1"
+else
+    CTYPE=""
+fi
+
+WTYPE=
+
+if [ $# -ge 1 ] ; then
+    NREPEAT=$1
+else
+    NREPEAT=1
+fi
+
+Operation=7
+
+## for generic validation
+for op in $Operation; do
+    set -x
+    ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
+    set +x
+done
+
+Operation=5
+
+## for performance evaluation (resnet50 NHWC => C)
+for op in $Operation; do
+    set -x
+    ./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    ./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
+    set +x
+done
+
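The next script exercises the indexable reductions (Operation=4), toggling index output on and off via -I in each command. For reference, the rule the argument parser above applies when deciding whether indices are needed reduces to the following predicate; the enum names here are simplified stand-ins for ReduceTensorOp_t and ReduceTensorIndices_t:

    // Simplified stand-ins for ReduceTensorOp_t and ReduceTensorIndices_t.
    enum class ReduceOp { MIN, MAX, AMAX, AVG, NORM2 };
    enum class IndicesOpt { NO_INDICES, WITH_INDICES };

    // Indices are produced only for indexable reductions (MIN/MAX/AMAX),
    // and only when the user asked for them.
    bool need_indices(ReduceOp op, IndicesOpt opt)
    {
        const bool indexable =
            (op == ReduceOp::MIN || op == ReduceOp::MAX || op == ReduceOp::AMAX);
        return indexable && opt != IndicesOpt::NO_INDICES;
    }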
diff --git a/script/profile_reduce_with_index.sh b/script/profile_reduce_with_index.sh
new file mode 100755
index 0000000000..109e4ef4e3
--- /dev/null
+++ b/script/profile_reduce_with_index.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+PRECISION=  ##--half
+
+if [ $# -ge 1 ] ; then
+    NREPEAT=$1
+else
+    NREPEAT=1
+fi
+
+Operation=4
+
+LENGTHS=64,4,280,82
+
+## for generic validation
+for op in $Operation; do
+    for use_idx in 0 1; do
+        set -x
+        ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op -I $use_idx -v 1 1 $NREPEAT
+        set +x
+    done
+done
+
+## for performance evaluation (resnet50 NHWC => C)
+for op in $Operation; do
+    for use_idx in 0 1; do
+        set -x
+        ./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        ./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
+        set +x
+    done
+done