diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
new file mode 100644
index 0000000000..20075526b2
--- /dev/null
+++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
@@ -0,0 +1,613 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
+#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
+
+#include "data_type.hpp"
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_blockwise.hpp"
+
+#include "blockwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename srcDataType,
+          typename dstDataType,
+          typename compType,
+          typename src2dDescType,
+          typename dst1dDescType,
+          ReduceTensorOp_t op,
+          NanPropagation_t nanPropaOpt,
+          ReduceTensorIndices_t reduceIndicesOpt,
+          index_t GredAccessesPerThreadInBlock>
+struct GridwiseReduction_xy_to_x_blockwise
+{
+    using opReduce = typename reduce_binary_operator<compType, op>::opType;
+    using preUnaryOpType =
+        typename reduce_unary_operator<compType, op, true, true>::preUnaryOp;
+    using posUnaryOpType =
+        typename reduce_unary_operator<compType, op, true, true>::posUnaryOp;
+
+    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
+    using blockwise_reduce =
+        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
+
+    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
+
+    static constexpr auto I0 = Number<0>{};
+
+    template <int RunId>
+    __device__ static void Run(const src2dDescType& src2dDesc,
+                               const dst1dDescType& dst1dDesc,
+                               int origReduceLen,
+                               srcDataType alpha,
+                               const srcDataType* const __restrict__ p_src_global,
+                               dstDataType beta,
+                               dstDataType* const __restrict__ p_dst_global,
+                               const int* const __restrict__ ws_indices_global,
+                               int* const __restrict__ indices_global);
+
+    template <>
+    __device__ static void Run<1>(const src2dDescType& src2dDesc,
+                                  const dst1dDescType& dst1dDesc,
+                                  int origReduceLen,
+                                  srcDataType alpha,
+                                  const srcDataType* const __restrict__ p_src_global,
+                                  dstDataType beta,
+                                  dstDataType* const __restrict__ p_dst_global,
+                                  const int* const __restrict__ ws_indices_global,
+                                  int* const __restrict__ indices_global)
+    {
+        (void)ws_indices_global;
+        (void)indices_global;
+
+        // LDS
+        __shared__ compType p_in_block_buffer[BlockBufferSize];
+
+        auto zeroVal = opReduce::GetZeroVal();
+
+        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
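+            // zeroVal is opReduce's neutral element; supplying it as the buffer's
+            // invalid-element value means out-of-range reads cannot skew the result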
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + + auto in_block_buf = + make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); + StaticBuffer accuValue_buf; + + accuValue_buf(I0) = zeroVal; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + const posUnaryOpType posUnaryOp(divider); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + constexpr auto in_block_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); + + using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; + using ThreadClusterLengths = Sequence<1, BlockSize>; + + auto blockwise_src_load = + BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + srcDataType, + compType, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + true>(src2dDesc, + make_multi_index(block_global_1d_id, 0), + in_block_desc, + make_multi_index(0, 0)); + + constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); + + const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; + + for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; + reducedBlocks += GredAccessesPerThreadInBlock) + { + blockwise_src_load.RunRead(src2dDesc, src_global_buf); + blockwise_src_load.RunWrite(in_block_desc, in_block_buf); + + __syncthreads(); + + // do element-wise pre-reduction operation + blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); + + index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) + ? 
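+                                          // full pass: GredAccessesPerThreadInBlock sub-blocks;
+                                          // final pass: only the remaining tail sub-blocks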
GredAccessesPerThreadInBlock + : toReduceBlocks - reducedBlocks; + blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); + + blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + } + + accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first thread in the block stores the reduced result to the global location + // representing the block + if(thread_local_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run( + dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + threadwise_dst_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf); + } + }; + + template <> + __device__ static void Run<2>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)ws_indices_global; + + // LDS + __shared__ compType p_in_block_buffer[BlockBufferSize]; + __shared__ int block_indices_buffer[BlockBufferSize]; + + auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + auto in_block_val_buf = + make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); + auto in_block_idx_buf = + make_dynamic_buffer(block_indices_buffer, BlockBufferSize); + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + constexpr auto in_block_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); + + using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; + using ThreadClusterLengths = Sequence<1, BlockSize>; + + auto blockwise_src_load = + BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + srcDataType, + dstDataType, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + true>(src2dDesc, + make_multi_index(block_global_1d_id, 0), + in_block_desc, + make_multi_index(0, 0)); + + constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); + + const index_t toReduceBlocks = (toReduceLength + BlockSize - 
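+        // ceil-division; with (hypothetical sizes) toReduceLength = 1000 and
+        // BlockSize = 256 this yields 4 sub-blocks, the last partially populated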
1) / BlockSize; + + int indexOffset = 0; + + for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; + reducedBlocks += GredAccessesPerThreadInBlock) + { + // load block data from global to LDS, no use of double buffers (to be improved) + blockwise_src_load.RunRead(src2dDesc, src_global_buf); + blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); + + __syncthreads(); + + // construct the indices for the current toReduce blocks + blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); + + // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually + // done here + blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); + + index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) + ? GredAccessesPerThreadInBlock + : toReduceBlocks - reducedBlocks; + + blockwise_reduce::Reduce2(in_block_val_buf, + in_block_idx_buf, + BlocksInOneOp, + accuValue_buf(I0), + accuIndex_buf(I0)); + + indexOffset += BlockBufferSize; + + blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first thread in the block stores the reduced result to the global location + // representing the block + if(thread_local_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run(dst1dDesc, + dst_global_val_buf, + ReducedDataDesc, + make_tuple(I0), + priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + } + }; + + template <> + __device__ static void Run<3>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ ws_values_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)origReduceLen; + + // LDS + __shared__ compType p_in_block_buffer[BlockBufferSize]; + __shared__ int block_indices_buffer[BlockBufferSize]; + + auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_val_buf = + make_dynamic_buffer(ws_values_global, + src2dDesc.GetElementSpaceSize(), + type_convert{}(zeroVal)); + const auto src_global_idx_buf = make_dynamic_buffer( + ws_indices_global, src2dDesc.GetElementSpaceSize()); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + auto in_block_val_buf = + 
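+        // LDS view of the candidate values; in_block_idx_buf below pairs each value
+        // with the global index it carried in from the multi-block workspace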
make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); + auto in_block_idx_buf = + make_dynamic_buffer(block_indices_buffer, BlockBufferSize); + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + constexpr auto in_block_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); + + using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; + using ThreadClusterLengths = Sequence<1, BlockSize>; + + auto blockwise_src_val_load = + BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + srcDataType, + compType, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + true>(src2dDesc, + make_multi_index(block_global_1d_id, 0), + in_block_desc, + make_multi_index(0, 0)); + + auto blockwise_src_idx_load = + BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + int, + int, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + true>(src2dDesc, + make_multi_index(block_global_1d_id, 0), + in_block_desc, + make_multi_index(0, 0)); + + constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); + + const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; + + for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; + reducedBlocks += GredAccessesPerThreadInBlock) + { + // load block data from global to LDS, no use of double buffers (to be improved) + blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf); + blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf); + blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf); + blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf); + + __syncthreads(); + + index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) + ? 
GredAccessesPerThreadInBlock + : toReduceBlocks - reducedBlocks; + + blockwise_reduce::Reduce2(in_block_val_buf, + in_block_idx_buf, + BlocksInOneOp, + accuValue_buf(I0), + accuIndex_buf(I0)); + + blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first thread in the block stores the reduced result to the global location + // representing the block + if(thread_local_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run(dst1dDesc, + dst_global_val_buf, + ReducedDataDesc, + make_tuple(I0), + priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(block_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp new file mode 100644 index 0000000000..a38c2dc335 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp @@ -0,0 +1,491 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP +#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP + +#include "data_type.hpp" +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_threadwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +struct GridwiseReduction_xy_to_x_direct_threadwise +{ + using opReduce = typename reduce_binary_operator::opType; + using preUnaryOpType = + typename reduce_unary_operator::preUnaryOp; + using posUnaryOpType = + typename reduce_unary_operator::posUnaryOp; + + static constexpr auto I0 = Number<0>{}; + + template + __device__ static void Run(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global); + + template <> + __device__ static void Run<1>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)ws_indices_global; + (void)indices_global; + + const auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + using threadwise_reduce = ThreadReduce; + + StaticBuffer accuValue_buf; + + accuValue_buf(I0) = zeroVal; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + const posUnaryOpType posUnaryOp(divider); + + using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, make_multi_index(thread_global_1d_id, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); + + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += GredThreadBufferLength) + { + threadwise_src_load.Run( + src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); + + // do element-wise pre-reduction operation + threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); + + // do the reduction on the Thread Buffer + threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); + + threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>( + 
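+                // re-read the prior dst element so the final store realizes
+                //     dst = alpha * reduce(src) + beta * prior_dst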
dst1dDesc, make_multi_index(thread_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run( + dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(thread_global_1d_id)); + + threadwise_dst_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf); + }; + + template <> + __device__ static void Run<2>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)ws_indices_global; + + const auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + using threadwise_reduce = ThreadReduce; + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + + using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, make_multi_index(thread_global_1d_id, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); + + index_t indexStart = 0; + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += GredThreadBufferLength) + { + threadwise_src_load.Run( + src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); + + // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually + // done here + threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); + + // do the reduction on the Thread Buffer + threadwise_reduce::Reduce2( + in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart); + + indexStart += GredThreadBufferLength; + + threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>( + dst1dDesc, make_multi_index(thread_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run( + dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + 
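+        // arg-reduction emits a pair per output element: the reduced value here,
+        // and its winning position via the idx store constructed next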
ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(thread_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(thread_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + }; + + template <> + __device__ static void Run<3>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ ws_values_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)origReduceLen; + + const auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_val_buf = + make_dynamic_buffer(ws_values_global, + src2dDesc.GetElementSpaceSize(), + type_convert{}(zeroVal)); + const auto src_global_idx_buf = make_dynamic_buffer( + ws_indices_global, src2dDesc.GetElementSpaceSize()); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_val_buf; + StaticBuffer in_thread_idx_buf; + + using threadwise_reduce = ThreadReduceWithIndicesInput; + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, make_multi_index(thread_global_1d_id, 0)); + + auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, make_multi_index(thread_global_1d_id, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); + + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += GredThreadBufferLength) + { + threadwise_src_val_load.Run(src2dDesc, + src_global_val_buf, + ThreadBufferDesc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(src2dDesc, + src_global_idx_buf, + ThreadBufferDesc, + make_tuple(I0, I0), + in_thread_idx_buf); + + // do the reduction on the Thread Buffer + threadwise_reduce::Reduce( + in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); + + threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>( + dst1dDesc, make_multi_index(thread_global_1d_id)); + + StaticBuffer 
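+                // single-element buffer holding the previous dst value for the beta blend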
priorDstValue_buf; + + threadwise_dst_load.Run( + dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(thread_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + false>(dst1dDesc, + make_multi_index(thread_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp new file mode 100644 index 0000000000..0d7cef9360 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp @@ -0,0 +1,532 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP +#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP + +#include "data_type.hpp" +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_warpwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +struct GridwiseReduction_xy_to_x_direct_warpwise +{ + using opReduce = typename reduce_binary_operator::opType; + using preUnaryOpType = + typename reduce_unary_operator::preUnaryOp; + using posUnaryOpType = + typename reduce_unary_operator::posUnaryOp; + + static constexpr auto I0 = Number<0>{}; + + template + __device__ static void Run(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global); + + template <> + __device__ static void Run<1>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)ws_indices_global; + (void)indices_global; + + auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + using warpwise_reduce = + WarpReduce; + + StaticBuffer accuValue_buf; + + accuValue_buf(I0) = zeroVal; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + const posUnaryOpType posUnaryOp(divider); + + using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + index_t warp_global_1d_id = thread_global_1d_id / warpSize; + index_t thread_inwarp_id = thread_global_1d_id % warpSize; + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, + make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); + + constexpr auto in_thread_copy_step = + make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); + + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += warpSize * GredAccessesPerThreadInWarp) + { + threadwise_src_load.Run( + src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); + + // do element-wise pre-reduction operation + warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); + + // do the warp-wise reduction on data of all thread buffers + warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); + + threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first 
thread in the warp stores the reduced result to the global location + // representing the Warp + if(thread_inwarp_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run( + dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf(I0) * beta); + } + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + threadwise_dst_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf); + } + }; + + template <> + __device__ static void Run<2>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)ws_indices_global; + + auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + using warpwise_reduce = + WarpReduce; + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + + using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + index_t warp_global_1d_id = thread_global_1d_id / warpSize; + index_t thread_inwarp_id = thread_global_1d_id % warpSize; + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, + make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); + + constexpr auto in_thread_copy_step = + make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); + + index_t indexOffset = 0; + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += warpSize * GredAccessesPerThreadInWarp) + { + threadwise_src_load.Run( + src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); + + // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually + // done here + warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); + + // do the warp-wise reduction on data of all thread buffers + warpwise_reduce::Reduce2( + in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset); + + indexOffset += warpSize * GredAccessesPerThreadInWarp; + + threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + constexpr auto ReducedDataDesc = + 
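+        // one-element descriptor: each warp contributes exactly one reduced value,
+        // stored at its warp_global_1d_id slot of the output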
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first thread in the warp stores the reduced result to the global location + // representing the Warp + if(thread_inwarp_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run(dst1dDesc, + dst_global_val_buf, + ReducedDataDesc, + make_tuple(I0), + priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + } + }; + + template <> + __device__ static void Run<3>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + srcDataType alpha, + const srcDataType* const __restrict__ ws_values_global, + dstDataType beta, + dstDataType* const __restrict__ p_dst_global, + const int* const __restrict__ ws_indices_global, + int* const __restrict__ indices_global) + { + (void)origReduceLen; + + auto zeroVal = opReduce::GetZeroVal(); + + const auto src_global_val_buf = + make_dynamic_buffer(ws_values_global, + src2dDesc.GetElementSpaceSize(), + type_convert{}(zeroVal)); + const auto src_global_idx_buf = make_dynamic_buffer( + ws_indices_global, src2dDesc.GetElementSpaceSize()); + auto dst_global_val_buf = make_dynamic_buffer( + p_dst_global, dst1dDesc.GetElementSpaceSize()); + auto dst_global_idx_buf = make_dynamic_buffer( + indices_global, dst1dDesc.GetElementSpaceSize()); + + StaticBuffer + in_thread_val_buf; + StaticBuffer + in_thread_idx_buf; + + using warpwise_reduce = WarpReduceWithIndicesInput; + + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; + constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + index_t warp_global_1d_id = thread_global_1d_id / warpSize; + index_t thread_inwarp_id = thread_global_1d_id % warpSize; + + auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, + make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); + + auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + false>( + src2dDesc, + make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); + + constexpr auto in_thread_copy_step = + make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); + + for(index_t reducedLength = 0; reducedLength < toReduceLength; + reducedLength += warpSize * 
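+            // every lane consumes GredAccessesPerThreadInWarp elements per pass, so the
+            // slice window advances warpSize * GredAccessesPerThreadInWarp each iteration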
GredAccessesPerThreadInWarp) + { + threadwise_src_val_load.Run(src2dDesc, + src_global_val_buf, + ThreadBufferDesc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(src2dDesc, + src_global_idx_buf, + ThreadBufferDesc, + make_tuple(I0, I0), + in_thread_idx_buf); + + // do the warp-wise reduction on data of all thread buffers + warpwise_reduce::Reduce( + in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); + + threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + // The first thread in the warp stores the reduced result to the global location + // representing the Warp + if(thread_inwarp_id == 0) + { + if(!float_equal_one{}(alpha)) + accuValue_buf(I0) *= type_convert{}(alpha); + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + StaticBuffer priorDstValue_buf; + + threadwise_dst_load.Run(dst1dDesc, + dst_global_val_buf, + ReducedDataDesc, + make_tuple(I0), + priorDstValue_buf); + + accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + } + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(dst1dDesc, + make_multi_index(warp_global_1d_id)); + + threadwise_dst_val_store.Run( + ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf); + threadwise_dst_idx_store.Run( + ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp new file mode 100644 index 0000000000..c563893101 --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp @@ -0,0 +1,376 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP +#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_blockwise.hpp" + +#include "blockwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +struct GridwiseReduction_xy_to_x_multiblock +{ + using opReduce = typename reduce_binary_operator::opType; + using preUnaryOpType = typename reduce_unary_operator::preUnaryOp; + using posUnaryOpType = typename reduce_unary_operator::posUnaryOp; + + static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + using blockwise_reduce = + BlockwiseReduction_2d_block_buffer; + + static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize(); + + static constexpr auto I0 = Number<0>{}; + + template + __device__ static void Run(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + int BlkGroupSize, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + srcDataType* const __restrict__ ws_values_global, + int* const __restrict__ ws_indices_global); + + template <> + __device__ static void Run<1>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + int BlkGroupSize, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + srcDataType* const __restrict__ ws_values_global, + int* const __restrict__ ws_indices_global) + { + (void)ws_indices_global; + + (void)alpha; // unused + (void)beta; // unused + + auto zeroVal = opReduce::GetZeroVal(); + + // LDS + __shared__ compType p_in_block_buffer[BlockBufferSize]; + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto workspace_global_buf = make_dynamic_buffer( + ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); + + auto in_block_buf = + make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); + StaticBuffer accuValue_buf; + + accuValue_buf(I0) = zeroVal; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / BlkGroupSize; + const index_t block_local_id = block_global_id % BlkGroupSize; + + const index_t reduceSizePerBlock = + (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / + BlockBufferSize) * + BlockBufferSize; + + constexpr auto in_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; + using ThreadClusterLengths = Sequence<1, BlockSize>; + + auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + srcDataType, + compType, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + 
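+                                   // BlkGroupSize blocks cooperate on one output row;
+                                   // e.g. (hypothetical) toReduceLength = 10000, BlkGroupSize = 4,
+                                   // BlockBufferSize = 1024: reduceSizePerBlock =
+                                   // ceil(2500 / 1024) * 1024 = 3072, so every block loads aligned tiles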
true>( + src2dDesc, + make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), + in_block_desc, + make_multi_index(0, 0)); + + constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); + + const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; + + for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; + reducedBlocks += GredAccessesPerThreadInBlock) + { + blockwise_src_load.RunRead(src2dDesc, src_global_buf); + blockwise_src_load.RunWrite(in_block_desc, in_block_buf); + __syncthreads(); + + // do element-wise pre-reduction operation + blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); + + index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) + ? GredAccessesPerThreadInBlock + : toReduceBlocks - reducedBlocks; + blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); + + blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + const auto workspace_desc = + make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); + + // The first thread in the block stores the reduced result to the global location + // representing the block + if(thread_local_id == 0) + { + auto threadwise_workspace_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(workspace_desc, + make_multi_index(block_global_id)); + + threadwise_workspace_store.Run(ReducedDataDesc, + make_tuple(I0), + accuValue_buf, + workspace_desc, + workspace_global_buf); + } + }; + + template <> + __device__ static void Run<2>(const src2dDescType& src2dDesc, + const dst1dDescType& dst1dDesc, + int origReduceLen, + int BlkGroupSize, + srcDataType alpha, + const srcDataType* const __restrict__ p_src_global, + dstDataType beta, + srcDataType* const __restrict__ ws_values_global, + int* const __restrict__ ws_indices_global) + { + (void)alpha; // unused + (void)beta; // unused + + auto zeroVal = opReduce::GetZeroVal(); + + // LDS + __shared__ compType p_in_block_values_buffer[BlockBufferSize]; + __shared__ int p_in_block_indices_buffer[BlockBufferSize]; + + const auto src_global_buf = make_dynamic_buffer( + p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); + auto workspace_global_val_buf = make_dynamic_buffer( + ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); + auto workspace_global_idx_buf = make_dynamic_buffer( + ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize); + + auto in_block_val_buf = + make_dynamic_buffer(p_in_block_values_buffer, BlockBufferSize); + auto in_block_idx_buf = make_dynamic_buffer( + p_in_block_indices_buffer, BlockBufferSize); + StaticBuffer accuValue_buf; + StaticBuffer accuIndex_buf; + + accuValue_buf(I0) = zeroVal; + accuIndex_buf(I0) = 0; + + const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); + const int divider = origReduceLen; + + const preUnaryOpType preUnaryOp(divider); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / BlkGroupSize; + const index_t block_local_id = block_global_id % BlkGroupSize; + + const index_t reduceSizePerBlock = + (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / + BlockBufferSize) * + BlockBufferSize; + + constexpr auto in_block_desc = 
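+        // LDS tile descriptor; the per-block (value, index) partials written to the
+        // workspace are later collapsed by the indexed blockwise Run<3> pass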
make_naive_tensor_descriptor_packed( + make_tuple(Number<1>{}, Number{})); + + using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; + using ThreadClusterLengths = Sequence<1, BlockSize>; + + auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, + ThreadSliceLengths, + ThreadClusterLengths, + Sequence<0, 1>, + srcDataType, + compType, + src2dDescType, + decltype(in_block_desc), + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + 1, + 1, + 1, + 1, + false, + true>( + src2dDesc, + make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), + in_block_desc, + make_multi_index(0, 0)); + + constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); + + const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; + + int indexOffset = block_local_id * reduceSizePerBlock; + + for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; + reducedBlocks += GredAccessesPerThreadInBlock) + { + blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); + + blockwise_src_load.RunRead(src2dDesc, src_global_buf); + blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); + + __syncthreads(); + + // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually + // done here + blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); + + index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) + ? GredAccessesPerThreadInBlock + : toReduceBlocks - reducedBlocks; + + blockwise_reduce::Reduce2(in_block_val_buf, + in_block_idx_buf, + BlocksInOneOp, + accuValue_buf(I0), + accuIndex_buf(I0)); + + indexOffset += BlockBufferSize; + + blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); + } + + constexpr auto ReducedDataDesc = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + const auto workspace_desc = + make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); + + // The first thread in the block stores the reduced result to the global location + // representing the block + if(thread_local_id == 0) + { + auto threadwise_workspace_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(workspace_desc, + make_multi_index(block_global_id)); + + auto threadwise_workspace_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum_t::Set, + 1, + true>(workspace_desc, + make_multi_index(block_global_id)); + + threadwise_workspace_val_store.Run(ReducedDataDesc, + make_tuple(I0), + accuValue_buf, + workspace_desc, + workspace_global_val_buf); + threadwise_workspace_idx_store.Run(ReducedDataDesc, + make_tuple(I0), + accuIndex_buf, + workspace_desc, + workspace_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp new file mode 100644 index 0000000000..3df257a6d9 --- /dev/null +++ b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp @@ -0,0 +1,271 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_binop.hpp"
+
+namespace ck {
+
+template <typename buffer2dDescType,
+          bool blockIsOneRow,
+          typename opReduce,
+          NanPropagation_t nanPropaOpt>
+struct BlockwiseReduction_2d_block_buffer
+{
+    using compType = typename opReduce::dataType;
+
+    static constexpr auto buffer2dDesc = buffer2dDescType{};
+
+    static constexpr index_t BlockSize =
+        blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
+    static constexpr index_t NumBlocks =
+        blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
+    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    // This interface does not accumulate on indices
+    template <typename BufferType>
+    __device__ static void
+    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
+    {
+        const index_t thread_local_id = get_thread_local_1d_id();
+        compType lAccuData = opReduce::GetZeroVal();
+
+        // serial stage: each thread folds its column (or row) of the 2d block
+        // buffer into one private partial result
+        index_t offset;
+        for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
+        {
+            offset = blockIsOneRow
+                         ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
+                         : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
+            compType opData = type_convert<compType>{}(block_buffer[offset]);
+
+            binop::calculate(lAccuData, opData);
+        }
+
+        offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
+                               : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
+
+        block_buffer(offset) = lAccuData;
+
+        __syncthreads();
+
+        // parallel stage: tree reduction over the BlockSize partials, the active
+        // half of the threads folding the upper half into the lower half each step
+        for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
+        {
+            if(thread_local_id < indOffset)
+            {
+                index_t offset1 =
+                    blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
+                                  : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
+
+                index_t offset2 =
+                    blockIsOneRow
+                        ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
+                        : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
+
+                compType opData1 = type_convert<compType>{}(block_buffer[offset1]);
+                compType opData2 = type_convert<compType>{}(block_buffer[offset2]);
+                binop::calculate(opData1, opData2);
+                block_buffer(offset1) = type_convert<compType>{}(opData1);
+            }
+
+            __syncthreads();
+        }
+
+        if(thread_local_id == 0)
+        {
+            compType tmpVal = type_convert<compType>{}(block_buffer[0]);
+
+            binop::calculate(accuData, tmpVal);
+        }
+    };
+
+    // This interface accumulates on both data values and indices; the four-argument
+    // binop::calculate folds (value, index) pairs, keeping the index of the winning value
+    template <typename BufferType, typename IdxBufferType>
+    __device__ static void Reduce2(BufferType& block_buffer,
+                                   IdxBufferType& block_indices_buffer,
+                                   index_t toReduceBlocks,
+                                   compType& accuData,
+                                   int& accuIndex)
+    {
+        const index_t thread_local_id = get_thread_local_1d_id();
+        compType lAccuData = opReduce::GetZeroVal();
+        int lAccuIndex = 0;
+
+        if constexpr(blockIsOneRow)
+        {
+            // threads are adjacent within a row: reduce each row in place with an
+            // interleaved pairwise tree (even threads fold their right neighbour)
+            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
+            {
+                for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
+                {
+                    if(thread_local_id % (indOffset * 2) == 0)
+                    {
+                        index_t offset1 =
+                            buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
+                        index_t offset2 = buffer2dDesc.CalculateOffset(
+                            make_tuple(otherDimInd, thread_local_id + indOffset));
+
+                        compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
+                        compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
+                        int currIndex1 = block_indices_buffer[offset1];
+                        int currIndex2 = block_indices_buffer[offset2];
+
+                        binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
+                        block_buffer(offset1) = type_convert<compType>{}(currVal1);
+                        block_indices_buffer(offset1) = currIndex1;
+                    }
+                    __syncthreads();
+                }
+            }
+
+            if(thread_local_id == 0)
+            {
+                for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
+                {
+                    index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));
+
+                    compType tmpVal = type_convert<compType>{}(block_buffer[offset]);
+                    int tmpIndex = block_indices_buffer[offset];
+
+                    binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
+                }
+
+                binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
+            }
+        }
+        else
+        {
+            index_t offset;
+
+            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
+            {
+                offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
+                compType currVal = type_convert<compType>{}(block_buffer[offset]);
+                int currIndex = block_indices_buffer[offset];
+
+                binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
+            }
+
+            offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
+
+            block_buffer(offset) = lAccuData;
+            block_indices_buffer(offset) = lAccuIndex;
+
+            __syncthreads();
+
+            for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
+            {
+                if(thread_local_id % (indOffset * 2) == 0)
+                {
+                    index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
+                    index_t offset2 =
+                        buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
+
+                    compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
+                    compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
+                    int currIndex1 = block_indices_buffer[offset1];
+                    int currIndex2 = block_indices_buffer[offset2];
+
+                    binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
+                    block_buffer(offset1) = type_convert<compType>{}(currVal1);
+                    block_indices_buffer(offset1) = currIndex1;
+                }
+
+                __syncthreads();
+            }
+
+            if(thread_local_id == 0)
+            {
+                compType tmpVal = type_convert<compType>{}(block_buffer[0]);
+                int
tmpIndex = block_indices_buffer[0]; + + binop::calculate(accuData, tmpVal, accuIndex, tmpIndex); + } + } + }; + + template + __device__ static void set_buffer_value(BufferType& block_buffer, compType value) + { + index_t thread_id = get_thread_local_1d_id(); + + for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) + { + index_t offset = blockIsOneRow + ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) + : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); + + block_buffer(offset) = value; + + __syncthreads(); + } + }; + + // Initialize the block-wise indices buffer, the index for each element in the block-wise data + // buffer + // is calculated according to its position in the buffer and the global starting index + template + __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart) + { + index_t thread_id = get_thread_local_1d_id(); + + for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) + { + index_t offset = blockIsOneRow + ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) + : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); + + block_indices_buffer(offset) = offset + indexStart; + + __syncthreads(); + } + }; + + // Execute unary operation on the block buffer elements + template + __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer) + { + index_t thread_id = get_thread_local_1d_id(); + + for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) + { + index_t offset = blockIsOneRow + ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) + : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); + + block_buffer(offset) = unary_op(block_buffer[offset]); + + __syncthreads(); + } + }; +}; + +}; // end of namespace ck + +#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp new file mode 100644 index 0000000000..2956606a6b --- /dev/null +++ b/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp @@ -0,0 +1,141 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+#include "reduction_functions_binop.hpp"
+
+namespace ck {
+
+template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
+struct ThreadReduce
+{
+    using compType = typename opReduce::dataType;
+
+    static_assert(BufferType::IsStaticBuffer(),
+                  "Thread-wise reduction needs to use a StaticBuffer!");
+
+    static_assert(
+        std::is_same<typename BufferType::type, compType>::value,
+        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
+
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+
+    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    // This interface does not accumulate on indices
+    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
+    {
+        static_for<0, ThreadBufferLen, 1>{}(
+            [&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
+    };
+
+    // This interface accumulates on both data values and indices and
+    // is called by the Direct_ThreadWise reduction method for the first-time reduction
+    __device__ static void
+    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
+    {
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
+            int currIndex = I + indexStart;
+            binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
+        });
+    };
+
+    // Set the elements in the per-thread buffer to a specific value
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
+    };
+
+    // Execute unary operation on the per-thread buffer elements
+    template <typename unary_op_type>
+    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
+    {
+        static_for<0, ThreadBufferLen, 1>{}(
+            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
+    };
+};
+
+template <typename BufferType,
+          typename IdxBufferType,
+          typename opReduce,
+          NanPropagation_t nanPropaOpt>
+struct ThreadReduceWithIndicesInput
+{
+    using compType = typename opReduce::dataType;
+
+    static_assert(BufferType::IsStaticBuffer(),
+                  "Thread-wise reduction needs to use a StaticBuffer!");
+    static_assert(IdxBufferType::IsStaticBuffer(),
+                  "Thread-wise reduction needs to use a StaticBuffer for the indices!");
+
+    static_assert(
+        std::is_same<typename BufferType::type, compType>::value,
+        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
+    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
+                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");
+
+    static_assert(BufferType::Size() == IdxBufferType::Size(),
+                  "StaticBuffers for data and indices should have the same sizes!");
+
+    static constexpr index_t ThreadBufferLen = BufferType::Size();
+
+    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
+
+    // This interface accumulates on both data values and indices and
+    // is called by the Direct_ThreadWise reduction method for the second-time reduction
+    __device__ static void Reduce(const BufferType& thread_buffer,
+                                  const IdxBufferType& thread_indices_buffer,
+                                  compType& accuData,
+                                  int& accuIndex)
+    {
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
+            binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
+        });
+    };
+
+    // Set the elements in the per-thread buffer to a specific value
+    // cppcheck-suppress constParameter
+    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
+    {
+        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
thread_buffer(I) = value; }); + }; + + // Execute unary operation on the per-thread buffer elements + template + __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) + { + static_for<0, ThreadBufferLen, 1>{}( + [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); + }; +}; + +}; // end of namespace ck + +#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp new file mode 100644 index 0000000000..a8d5750b25 --- /dev/null +++ b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp @@ -0,0 +1,371 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP +#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP + +#include "data_type.hpp" + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_binop.hpp" + +namespace ck { + +template +struct WarpReduce +{ + using compType = typename opReduce::dataType; + using binop = detail::binop_with_nan_check; + + static_assert(BufferType::IsStaticBuffer(), + "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); + static_assert(std::is_same::value, + "Data type of per-thread StaticBuffer for WarpWise reduction should be same as " + "the compType!"); + + static constexpr index_t ThreadBufferLen = BufferType::Size(); + static constexpr bool have_builtin_shuffle = + std::is_same::value || std::is_same::value; + + // This interface does not accumulate on indices + __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData) + { + if constexpr(have_builtin_shuffle) + ReduceImpl1(thread_buffer, accuData); + else + ReduceImpl2(thread_buffer, accuData); + }; + + // This interface implementation uses HIP built-in device shuffling functions + __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData) + { + compType lAccuData = opReduce::GetZeroVal(); + + static_for<0, ThreadBufferLen, 1>{}( + [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); + + // synchronize among all threads in this warp + __all(1); + + for(index_t stride = warpSize / 2; stride > 0; stride /= 2) + { + compType tmpVal = __shfl_down(lAccuData, stride, warpSize); + binop::calculate(lAccuData, tmpVal); + __all(1); + } + + binop::calculate(accuData, lAccuData); + }; + + // This interface implementation does not use HIP built-in device shuffling functions + // since for fp16, built-in shuffling functions is not provided by HIP + __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData) + { + compType lAccuData = opReduce::GetZeroVal(); + + static_for<0, ThreadBufferLen, 1>{}( + [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); + + __syncthreads(); + + index_t thread_id = get_thread_local_1d_id(); + index_t warpId = thread_id / warpSize; + index_t thread_inwarp_id = thread_id % warpSize; + + __shared__ compType shuffle_buffer[BlockSize]; + + compType* myBuffer = &shuffle_buffer[warpId * warpSize]; + + myBuffer[thread_inwarp_id] = lAccuData; + + __syncthreads(); + + for(index_t stride = warpSize / 2; stride > 0; stride /= 2) + { + if(thread_inwarp_id < stride) + { + compType currVal1 = myBuffer[thread_inwarp_id]; + compType currVal2 = myBuffer[thread_inwarp_id + stride]; + + binop::calculate(currVal1, currVal2); + + myBuffer[thread_inwarp_id] = currVal1; + } + + __syncthreads(); + } + if(thread_inwarp_id == 0) + binop::calculate(accuData, myBuffer[0]); + }; + + // This interface accumulates on both data values and indices and is called by Direct_WarpWise + // reduction method at first-time reduction + __device__ static void + Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart) + { + if constexpr(have_builtin_shuffle) + Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart); + else + Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart); + }; + + // This interface implementation uses HIP built-in device shuffling functions + __device__ static void Reduce2Impl1(const BufferType& thread_buffer, + compType& accuData, 
+ int& accuIndex, + int indexStart) + { + compType lAccuData = opReduce::GetZeroVal(); + int lAccuIndex = 0; + index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize; + + static_for<0, ThreadBufferLen, 1>{}([&](auto I) { + int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; + binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); + }); + + // synchronize among all threads in this warp + __all(1); + + for(index_t stride = 1; stride < warpSize; stride *= 2) + { + compType tmpVal = __shfl_down(lAccuData, stride, warpSize); + int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); + + binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); + __all(1); + } + + if(thread_inwarp_id == 0) + binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); + }; + + // This interface implementation does not use HIP built-in device shuffling functions since for + // fp16, built-in shuffling functions is not provided by HIP + __device__ static void Reduce2Impl2(const BufferType& thread_buffer, + compType& accuData, + int& accuIndex, + int indexStart) + { + compType lAccuData = opReduce::GetZeroVal(); + int lAccuIndex = 0; + index_t thread_id = get_thread_local_1d_id(); + index_t warpId = thread_id / warpSize; + index_t thread_inwarp_id = thread_id % warpSize; + + static_for<0, ThreadBufferLen, 1>{}([&](auto I) { + int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; + binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); + }); + + __shared__ compType shuffle_data_buffer[BlockSize]; + __shared__ int shuffle_indices_buffer[BlockSize]; + + compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; + int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; + + myDataBuffer[thread_inwarp_id] = lAccuData; + myIndicesBuffer[thread_inwarp_id] = lAccuIndex; + + __syncthreads(); + + for(index_t stride = 1; stride < warpSize; stride *= 2) + { + compType currVal1 = myDataBuffer[thread_inwarp_id]; + compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; + int currIndex1 = myIndicesBuffer[thread_inwarp_id]; + int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; + + binop::calculate(currVal1, currVal2, currIndex1, currIndex2); + + myDataBuffer[thread_inwarp_id] = currVal1; + myIndicesBuffer[thread_inwarp_id] = currIndex1; + + __syncthreads(); + } + + if(thread_inwarp_id == 0) + binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); + }; + + // cppcheck-suppress constParameter + __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) + { + static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); + + __all(1); + }; + + // Execute unary operation on the per-thread buffer elements + template + __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) + { + static_for<0, ThreadBufferLen, 1>{}( + [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); + + __all(1); + }; +}; + +template +struct WarpReduceWithIndicesInput +{ + using compType = typename opReduce::dataType; + using binop = detail::binop_with_nan_check; + + static_assert(BufferType::IsStaticBuffer(), + "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); + static_assert(IdxBufferType::IsStaticBuffer(), + "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!"); + + static_assert(std::is_same::value, + "Data type of per-thread StaticBuffer for WarpWise reduction should be same as " + "the compType!"); + 
static_assert( + std::is_same::value, + "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!"); + + static_assert(BufferType::Size() == IdxBufferType::Size(), + "StaticBuffers for data and indices should have the same sizes!"); + + static constexpr index_t ThreadBufferLen = BufferType::Size(); + static constexpr bool have_builtin_shuffle = + std::is_same::value || std::is_same::value; + + // This interface accumulates on both data values and indices and is called by Direct_WarpWise + // reduction method at second-time reduction + __device__ static void Reduce(const BufferType& thread_buffer, + const IdxBufferType& thread_indices_buffer, + compType& accuData, + int& accuIndex) + { + if constexpr(have_builtin_shuffle) + ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex); + else + ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex); + }; + + // This interface implementation uses HIP built-in device shuffling functions + __device__ static void ReduceImpl1(const BufferType& thread_buffer, + const IdxBufferType& thread_indices_buffer, + compType& accuData, + int& accuIndex) + { + compType lAccuData = opReduce::GetZeroVal(); + int lAccuIndex = 0; + + static_for<0, ThreadBufferLen, 1>{}([&](auto I) { + binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); + }); + + // synchronize among all threads in this warp + __all(1); + + for(index_t stride = 1; stride < warpSize; stride *= 2) + { + compType tmpVal = __shfl_down(lAccuData, stride, warpSize); + int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); + + binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); + __all(1); + } + + binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); + }; + + // This interface implementation does not use HIP built-in device shuffling functions + // since for fp16, built-in shuffling functions is not provided by HIP + __device__ static void ReduceImpl2(const BufferType& thread_buffer, + const IdxBufferType& thread_indices_buffer, + compType& accuData, + int& accuIndex) + { + compType lAccuData = opReduce::GetZeroVal(); + int lAccuIndex = 0; + index_t thread_id = get_thread_local_1d_id(); + index_t warpId = thread_id / warpSize; + index_t thread_inwarp_id = thread_id % warpSize; + + static_for<0, ThreadBufferLen, 1>{}([&](auto I) { + binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); + }); + + __shared__ compType shuffle_data_buffer[BlockSize]; + __shared__ int shuffle_indices_buffer[BlockSize]; + + compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; + int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; + + myDataBuffer[thread_inwarp_id] = lAccuData; + myIndicesBuffer[thread_inwarp_id] = lAccuIndex; + + __syncthreads(); + + for(index_t stride = 1; stride < warpSize; stride *= 2) + { + compType currVal1 = myDataBuffer[thread_inwarp_id]; + compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; + int currIndex1 = myIndicesBuffer[thread_inwarp_id]; + int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; + + binop::calculate(currVal1, currVal2, currIndex1, currIndex2); + + myDataBuffer[thread_inwarp_id] = currVal1; + myIndicesBuffer[thread_inwarp_id] = currIndex1; + + __syncthreads(); + } + + if(thread_inwarp_id == 0) + binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); + }; + + // cppcheck-suppress constParameter + __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) + { + 
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); + + __all(1); + }; + + // Execute unary operation on the per-thread buffer elements + template + __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) + { + static_for<0, ThreadBufferLen, 1>{}( + [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); + + __all(1); + }; +}; + +}; // end of namespace ck + +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index d5c77f4a54..157828bf0f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -713,9 +713,6 @@ struct ThreadwiseTensorSliceTransfer_v3 : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) { - // TODO: fix this - static_assert(is_same::value, - "wrong! current implementation assume SrcData and DstData are same type"); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -985,7 +982,8 @@ struct ThreadwiseTensorSliceTransfer_v3 constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); - dst_tmp_vector.template AsType()(i) = buffer_[Number{}]; + dst_tmp_vector.template AsType()(i) = + type_convert{}(buffer_[Number{}]); }); using dst_vector_t = typename decltype(dst_tmp_vector)::type; diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index 7029bd850f..886737efac 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -38,6 +38,10 @@ struct DynamicBuffer return BufferAddressSpace; } + __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } + + __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } + template >::type, typename scalar_type>::type>::value, diff --git a/composable_kernel/include/utility/reduction_common.hpp b/composable_kernel/include/utility/reduction_common.hpp new file mode 100644 index 0000000000..139a18c2a4 --- /dev/null +++ b/composable_kernel/include/utility/reduction_common.hpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_COMMON_HPP
+#define CK_REDUCTION_COMMON_HPP
+
+// these enumerations must be kept synchronized with include/miopen/reduce_common.hpp
+namespace ck {
+enum class ReductionMethod_t
+{
+    DirectThreadWise = 1,
+    DirectWarpWise   = 2,
+    BlockWise        = 3,
+    MultiBlock       = 4
+};
+
+enum class ReduceTensorOp_t
+{
+    ADD   = 0,
+    MUL   = 1,
+    MIN   = 2,
+    MAX   = 3,
+    AMAX  = 4,
+    AVG   = 5,
+    NORM1 = 6,
+    NORM2 = 7,
+    // MUL_NO_ZEROS = 8,
+};
+
+enum class NanPropagation_t
+{
+    NOT_PROPAGATE_NAN = 0,
+    PROPAGATE_NAN     = 1,
+};
+
+enum class ReduceTensorIndices_t
+{
+    NO_INDICES        = 0,
+    FLATTENED_INDICES = 1,
+};
+
+enum class IndicesType_t
+{
+    INDICES_32BIT = 0,
+    INDICES_64BIT = 1,
+    INDICES_16BIT = 2,
+    INDICES_8BIT  = 3,
+};
+
+// compare via <= and >= so that no exact floating-point equality test is needed
+struct float_equal_one
+{
+    template <typename T>
+    __device__ static inline bool apply(T x)
+    {
+        return x <= type_convert<T>{}(1.0f) and x >= type_convert<T>{}(1.0f);
+    }
+
+    template <typename T>
+    __device__ inline bool operator()(T x)
+    {
+        return (float_equal_one::apply(x));
+    };
+};
+
+struct float_equal_zero
+{
+    template <typename T>
+    __device__ static inline bool apply(T x)
+    {
+        return x <= type_convert<T>{}(0.0f) and x >= type_convert<T>{}(0.0f);
+    }
+
+    template <typename T>
+    __device__ inline bool operator()(T x)
+    {
+        return (float_equal_zero::apply(x));
+    };
+};
+
+}; // end of namespace ck
+
+#endif
diff --git a/composable_kernel/include/utility/reduction_functions_binop.hpp b/composable_kernel/include/utility/reduction_functions_binop.hpp
new file mode 100644
index 0000000000..5285abee81
--- /dev/null
+++ b/composable_kernel/include/utility/reduction_functions_binop.hpp
@@ -0,0 +1,100 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
+#define CK_REDUCTION_FUNCTIONS_BINOP_HPP
+
+#include "data_type.hpp"
+
+#include "reduction_common.hpp"
+#include "reduction_operator.hpp"
+
+namespace ck {
+namespace detail {
+
+static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+
+template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
+struct binop_with_nan_check;
+
+template <typename opReduce, typename compType>
+struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
+{
+    // cppcheck-suppress constParameter
+    __device__ static inline void calculate(compType& accuVal, compType currVal)
+    {
+        opReduce{}(accuVal, currVal);
+    };
+
+    // The method is called when the opReduce is indexable and the user asked for indices
+    __device__ static inline void
+    // cppcheck-suppress constParameter
+    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    {
+        bool changed = false;
+
+        opReduce{}(accuVal, currVal, changed);
+
+        if(changed)
+            accuIndex = currIndex;
+    };
+};
+
+template <typename opReduce, typename compType>
+struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
+{
+    __device__ static inline void calculate(compType& accuVal, compType currVal)
+    {
+        if(isnan(currVal))
+            accuVal = currVal;
+        else
+            opReduce{}(accuVal, currVal);
+    };
+
+    // The method is called when the opReduce is indexable and the user asked for indices
+    __device__ static inline void
+    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    {
+        if(isnan(currVal))
+        {
+            accuVal   = currVal;
+            accuIndex = currIndex;
+        }
+        else
+        {
+            bool changed = false;
+
+            opReduce{}(accuVal, currVal, changed);
+
+            if(changed)
+                accuIndex = currIndex;
+        }
+    };
+};
+
+}; // namespace detail
+}; // end of namespace ck
+
+#endif
diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp
new file mode 100644
index 0000000000..269671a400
--- /dev/null
+++ b/composable_kernel/include/utility/reduction_operator.hpp
@@ -0,0 +1,420 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef CK_REDUCTION_OPERATOR_HPP
+#define CK_REDUCTION_OPERATOR_HPP
+
+#include "reduction_common.hpp"
+
+namespace ck {
+
+namespace reduce {
+
+// Every binary operator used in reduction is represented by a templated functor class.
Each functor
+// class must provide at least three members:
+// 1) GetZeroVal() -- returns the "identity element" of the binary operator, i.e. the unique
+//    element of the value space that leaves any other element unchanged when combined with it.
+// 2) indexable -- a boolean constant indicating whether the indices of the operated elements
+//    can be recorded. Operators like Min/Max usually need to record the indices of their
+//    elements; operators like Add/Mul do not.
+// 3) operator() -- the first argument is both an input and an output, and usually stores the
+//    accumulated result of many operator() calls; the second argument is an input only. For an
+//    indexable binary operator, a second overload of operator() takes a third (output) argument
+//    indicating whether the accumulated value (the first argument) has changed, in which case
+//    the recorded accumulated index also needs to be changed.
+
+template <typename T>
+struct Add
+{
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return type_convert<T>{}(0.0f); };
+
+    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
+
+    static constexpr bool indexable = false;
+};
+
+template <typename T>
+struct Mul
+{
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return type_convert<T>{}(1.0f); };
+
+    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
+
+    static constexpr bool indexable = false;
+};
+
+template <typename T>
+struct Max
+{
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return std::numeric_limits<T>::min(); };
+
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        if(a < b)
+            a = b;
+    }
+
+    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    {
+        if(a < b)
+        {
+            a       = b;
+            changed = true;
+        }
+    }
+
+    static constexpr bool indexable = true;
+};
+
+template <typename T>
+struct Min
+{
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return std::numeric_limits<T>::max(); };
+
+    __device__ inline constexpr void operator()(T& a, T b) const
+    {
+        if(a > b)
+            a = b;
+    }
+
+    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    {
+        if(a > b)
+        {
+            a       = b;
+            changed = true;
+        }
+    }
+
+    static constexpr bool indexable = true;
+};
+
+template <>
+__device__ half_t Max<half_t>::GetZeroVal()
+{
+    return type_convert<half_t>{}(std::numeric_limits<float>::min());
+};
+
+template <>
+__device__ half_t Min<half_t>::GetZeroVal()
+{
+    return type_convert<half_t>{}(std::numeric_limits<float>::max());
+};
+
+// Unary operators are usually applied element-wise before the reduction is executed on the
+// elements; they make reduction types such as AVG, NRM1 and NRM2 easy to implement.
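+// For example, AVG is realized as an ADD reduction combined with a scaling by 1.0f/divider,
+// and NRM2 squares each element before the reduction and applies a square root to the
+// accumulated value afterwards.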
+template <typename T, bool hasDividing>
+struct unary_identic
+{
+    __device__ unary_identic(const int divider = 1)
+    {
+        scaler = 1.0f / static_cast<float>(divider);
+    };
+
+    __device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };
+
+    float scaler = 1.0f;
+};
+
+template <typename T>
+struct unary_identic<T, false>
+{
+    __device__ unary_identic(const int divider = 1) { (void)divider; };
+
+    __device__ inline constexpr T operator()(T a) const { return a; };
+};
+
+template <typename T, bool hasDividing>
+struct unary_square
+{
+    __device__ unary_square(const int divider = 1)
+    {
+        scaler = 1.0f / static_cast<float>(divider);
+    };
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        a = a * a;
+
+        return a * type_convert<T>{}(scaler);
+    };
+
+    float scaler = 1.0f;
+};
+
+template <typename T>
+struct unary_square<T, false>
+{
+    __device__ unary_square(const int divider = 1) { (void)divider; };
+
+    __device__ inline constexpr T operator()(T a) const { return a * a; };
+};
+
+template <typename T, bool hasDividing>
+struct unary_abs
+{
+    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
+
+    __device__ inline constexpr T operator()(T a) const
+    {
+        a = abs(a);
+
+        return a * type_convert<T>{}(scaler);
+    };
+
+    float scaler = 1.0f;
+};
+
+template <typename T>
+struct unary_abs<T, false>
+{
+    __device__ unary_abs(const int divider = 1) { (void)divider; };
+
+    __device__ inline constexpr T operator()(T a) const { return abs(a); };
+};
+
+// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
+// Let's assume that __habs() exists since 3.5.
+#if HIP_PACKAGE_VERSION_FLAT < 3005000000
+inline __device__ __half __habs(__half x)
+{
+    union
+    {
+        __half half;
+        unsigned short u16;
+    } val;
+    val.half = x;
+    val.u16  = val.u16 & 0x7fff; // clear the sign bit
+    return val.half;
+}
+#endif
+
+template <bool hasDividing>
+struct unary_abs<half_t, hasDividing>
+{
+    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
+
+    __device__ inline half_t operator()(half_t a) const
+    {
+        a = static_cast<half_t>(__habs(a));
+
+        return a * type_convert<half_t>{}(scaler);
+    };
+
+    float scaler = 1.0f;
+};
+
+template <>
+struct unary_abs<half_t, false>
+{
+    __device__ unary_abs(const int divider = 1) { (void)divider; };
+
+    __device__ inline half_t operator()(half_t a) const
+    {
+        return static_cast<half_t>(__habs(a));
+    };
+};
+
+template <typename T>
+struct unary_sqrt
+{
+    __device__ unary_sqrt(const int divider = 1) { (void)divider; };
+
+    __device__ inline T operator()(T a) const { return sqrtf(a); };
+};
+
+template <>
+struct unary_sqrt<half_t>
+{
+    __device__ unary_sqrt(const int divider = 1) { (void)divider; };
+
+    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
+};
+
+}; // end of namespace reduce
+
+// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
+// respective functor classes.
+// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
+// reduce_binary_operator for easier checking by the upper-layer code in the kernels.
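+// A minimal usage sketch (illustrative only, not part of this header): kernel-side code picks
+// the functor through this mapping; 'compType' stands for whatever computation type was chosen.
+//
+//     using opReduce = typename reduce_binary_operator<compType, ReduceTensorOp_t::MAX>::opType;
+//     compType accu  = opReduce::GetZeroVal(); // start from the identity element
+//     opReduce{}(accu, val);                   // fold one more element into the accumulator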
+
+template <typename T, ReduceTensorOp_t op>
+struct reduce_binary_operator;
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
+{
+    using opType   = reduce::Add<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Add<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
+{
+    using opType   = reduce::Mul<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Mul<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Mul<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
+{
+    using opType   = reduce::Min<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Min<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Min<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
+{
+    using opType   = reduce::Max<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Max<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
+{
+    using opType   = reduce::Max<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Max<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
+{
+    using opType   = reduce::Add<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Add<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
+{
+    using opType   = reduce::Add<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Add<T>::indexable;
+};
+
+template <typename T>
+struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
+{
+    using opType   = reduce::Add<T>;
+    using dataType = T;
+
+    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
+
+    static constexpr bool indexable = reduce::Add<T>::indexable;
+};
+
+// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
+// functor classes. The two unary functors are called before and after the Reduction is
+// executed, respectively.
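+// A minimal usage sketch (illustrative only): for a single-pass NORM2 reduction the mapped pair
+// composes sqrt(sum(x*x)); 'compType' and 'divider' are stand-ins for the kernel's choices:
+//
+//     using preOp = typename reduce_unary_operator<compType, ReduceTensorOp_t::NORM2, true, true>::preUnaryOp;
+//     using posOp = typename reduce_unary_operator<compType, ReduceTensorOp_t::NORM2, true, true>::posUnaryOp;
+//     compType squared = preOp(divider)(x);    // applied to every element before accumulation
+//     compType result  = posOp(divider)(accu); // applied once to the accumulated value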
+template <typename T, ReduceTensorOp_t op, bool isFirstCall, bool isLastCall>
+struct reduce_unary_operator
+{
+    using preUnaryOp = reduce::unary_identic<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+template <typename T, bool isFirstCall>
+struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstCall, true>
+{
+    using preUnaryOp = reduce::unary_identic<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, true>;
+};
+
+template <typename T, bool isLastCall>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastCall>
+{
+    using preUnaryOp = reduce::unary_abs<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+template <typename T, bool isLastCall>
+struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastCall>
+{
+    using preUnaryOp = reduce::unary_abs<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
+{
+    using preUnaryOp = reduce::unary_square<T, false>;
+    using posUnaryOp = reduce::unary_identic<T, false>;
+};
+
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
+{
+    using preUnaryOp = reduce::unary_square<T, false>;
+    using posUnaryOp = reduce::unary_sqrt<T>;
+};
+
+template <typename T>
+struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
+{
+    using preUnaryOp = reduce::unary_identic<T, false>;
+    using posUnaryOp = reduce::unary_sqrt<T>;
+};
+
+} // end of namespace ck
+
+#endif
diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
new file mode 100644
index 0000000000..e16010dee1
--- /dev/null
+++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
@@ -0,0 +1,317 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_blockwise.hpp"
+
+using namespace ck;
+
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+
+constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ? ReduceTensorIndices_t::NO_INDICES
+                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;
+
+constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
+constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
+
+////////////////////////////////////////////////////////////////////////////////////////
+using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
+
+static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
+              "Wrong invariant and/or toReduce dimensions!");
+
+// The number of invariant dimensions can be zero if all dimensions are to be reduced
+static_assert(dstDims == 1,
+              "If all source dimensions are reduced, the dest should have only one dimension!");
+
+constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
+constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+
+constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
+
+// helper functions using variadic template arguments
+template <index_t... Ns>
+__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
+{
+    return make_tuple(static_cast<index_t>(lengths[Ns])...);
+};
+
+template <index_t arraySize>
+__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
+{
+    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
+
+    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
+
+    return make_tuple_from_array_and_index_seq(lengths, index_seq);
+};
+
+template <index_t... Ns>
+__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
+{
+    return make_tuple(Ns...);
+};
+
+extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
+                                                             int BlkGroupSize,
+                                                             int inLength0,
+                                                             int inLength1,
+                                                             int inLength2,
+                                                             int inLength3,
+                                                             int inLength4,
+                                                             int inLength5,
+                                                             int inStride0,
+                                                             int inStride1,
+                                                             int inStride2,
+                                                             int inStride3,
+                                                             int inStride4,
+                                                             int inStride5,
+                                                             int outLength0,
+                                                             int outLength1,
+                                                             int outLength2,
+                                                             int outLength3,
+                                                             int outLength4,
+                                                             int outLength5,
+                                                             int outStride0,
+                                                             int outStride1,
+                                                             int outStride2,
+                                                             int outStride3,
+                                                             int outStride4,
+                                                             int outStride5,
+                                                             void* __restrict__ ws_global)
+{
+    (void)GridSize;
+    (void)BlkGroupSize;
+
+    void* p_src2dDesc = ws_global;
+    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
+
+    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
+    const int
srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto one_dim_srcDesc = transform_tensor_descriptor( + srcDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + auto src2dDesc = transform_tensor_descriptor( + one_dim_srcDesc, + make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + + if constexpr(src2d_need_padding) + { + const auto srcPad = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_src2dDesc = + transform_tensor_descriptor(ref_one_dim_srcDesc, + make_tuple(make_unmerge_transform( + make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename 
arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = + typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp new file mode 100644 index 0000000000..cba7ffe295 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp @@ -0,0 +1,318 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_blockwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? 
ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)GridSize; + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); + + auto src2dDesc = + transform_tensor_descriptor(srcDesc, + 
make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(toReduceDimLengths)), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + + if constexpr(src2d_need_padding) + { + const auto srcPad = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_toReduceDimLengths = + typename uniform_sequence_gen::type{}; + static constexpr auto ref_invariantDimLengths = + typename uniform_sequence_gen::type{}; + + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_src2dDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), + make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename 
get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp new file mode 100644 index 0000000000..34b877027c --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp @@ -0,0 +1,323 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_multiblock.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; // this could be empty + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge, toReduceDims>::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)GridSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, 
inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto one_dim_srcDesc = transform_tensor_descriptor( + srcDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + auto src2dDesc = transform_tensor_descriptor( + one_dim_srcDesc, + make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + const index_t reduceSizePerBlock = + (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * + copySliceLen; + + if constexpr(src2d_need_padding) + { + const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_src2dDesc = + transform_tensor_descriptor(ref_one_dim_srcDesc, + make_tuple(make_unmerge_transform( + make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + static constexpr auto ref_dst1dDesc = 
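+    // as with ref_src2dDesc above, only the resulting *type* of this
+    // descriptor is used; the dummy lengths and strides carry no real data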
transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = + typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_dst_global; + (void)indices_global; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + void* ws_buf1_global = static_cast(ws_global) + 4096; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + BlkGroupSize, + alpha, + static_cast(p_src_global), + beta, + static_cast(ws_buf1_global), + static_cast(ws_buf2_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp new file mode 100644 index 0000000000..9c7318dc15 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp @@ -0,0 +1,323 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "config.hpp"
+#include "number.hpp"
+#include "sequence.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "data_type_enum_helper.hpp"
+#include "reduction_common.hpp"
+#include "gridwise_generic_2d_reduction_multiblock.hpp"
+
+using namespace ck;
+
+using srcDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
+using dstDataType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
+using compType =
+    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
+
+constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
+
+constexpr index_t srcDims = CK_PARAM_IN_DIMS;
+constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
+
+using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
+using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
+
+constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
+constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
+                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
+                                             : NanPropagation_t::PROPAGATE_NAN;
+constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
+                                                       ?
ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)GridSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); + + auto src2dDesc = + transform_tensor_descriptor(srcDesc, + make_tuple(make_merge_transform(invariantDimLengths), + 
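+                                          // second half of the 2D view: every to-be-reduced
+                                          // dimension is merged into axis 1, which the block
+                                          // groups then reduce cooperatively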
make_merge_transform(toReduceDimLengths)), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + const index_t reduceSizePerBlock = + (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * + copySliceLen; + + if constexpr(src2d_need_padding) + { + const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_toReduceDimLengths = + typename uniform_sequence_gen::type{}; + static constexpr auto ref_invariantDimLengths = + typename uniform_sequence_gen::type{}; + + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_src2dDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), + make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using 
refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_dst_global; + (void)indices_global; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + void* ws_buf1_global = static_cast(ws_global) + 4096; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + BlkGroupSize, + alpha, + static_cast(p_src_global), + beta, + static_cast(ws_buf1_global), + static_cast(ws_buf2_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp new file mode 100644 index 0000000000..8e67d1faa1 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp @@ -0,0 +1,330 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge, toReduceDims>::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = 
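+        // of the six length/stride kernel arguments, only the first srcDims
+        // (resp. dstDims) entries are actually read below; the rest are placeholders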
{inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto one_dim_srcDesc = transform_tensor_descriptor( + srcDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + auto src2dDesc = transform_tensor_descriptor( + one_dim_srcDesc, + make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = GredThreadBufferLength; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_src2dDesc = + 
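+        // all-dims reduction: the flattened source is re-expanded ("unmerged")
+        // into a degenerate [1, totalLength] matrix, so the same 2D reduction
+        // machinery as the partial-dims case applies unchanged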
transform_tensor_descriptor(ref_one_dim_srcDesc, + make_tuple(make_unmerge_transform( + make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; + + constexpr int RunId = need_indices ? 
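+    // RunId 2 dispatches the index-producing variant of Run, RunId 1 the
+    // values-only variant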
2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp new file mode 100644 index 0000000000..fdbcda64ba --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp @@ -0,0 +1,331 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? 
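+        // 0 requests values only; any other value additionally emits the
+        // flattened index of each reduced result (for argmin/argmax-like ops)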
ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); + + auto src2dDesc = + transform_tensor_descriptor(srcDesc, + make_tuple(make_merge_transform(invariantDimLengths), + 
make_merge_transform(toReduceDimLengths)), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = GredThreadBufferLength; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_toReduceDimLengths = + typename uniform_sequence_gen::type{}; + static constexpr auto ref_invariantDimLengths = + typename uniform_sequence_gen::type{}; + + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected referrence type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_src2dDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), + make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + 
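+                                             // the pad amount (2) is an arbitrary placeholder;
+                                             // only the padded descriptor *type* is of interest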
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp new file mode 100644 index 0000000000..8aa1376c3a --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp @@ -0,0 +1,332 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge, toReduceDims>::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int 
outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto one_dim_srcDesc = transform_tensor_descriptor( + srcDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + auto src2dDesc = transform_tensor_descriptor( + one_dim_srcDesc, + make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected reference type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), 
make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), + make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_src2dDesc = + transform_tensor_descriptor(ref_one_dim_srcDesc, + make_tuple(make_unmerge_transform( + make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = + GridwiseReduction_xy_to_x_direct_warpwise; + + constexpr int RunId = need_indices ? 
2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp new file mode 100644 index 0000000000..e18d623fe5 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp @@ -0,0 +1,333 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? 
ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, + int BlkGroupSize, + int inLength0, + int inLength1, + int inLength2, + int inLength3, + int inLength4, + int inLength5, + int inStride0, + int inStride1, + int inStride2, + int inStride3, + int inStride4, + int inStride5, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; + const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); + + auto src2dDesc = + transform_tensor_descriptor(srcDesc, + make_tuple(make_merge_transform(invariantDimLengths), + 
make_merge_transform(toReduceDimLengths)), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLen = src2dDesc.GetLength(Number<0>{}); + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + + constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_toReduceDimLengths = + typename uniform_sequence_gen::type{}; + static constexpr auto ref_invariantDimLengths = + typename uniform_sequence_gen::type{}; + + static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; + static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; + + // don't have to use accurate strides to get an expected reference type + static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( + make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + + static constexpr auto ref_src2dDesc = transform_tensor_descriptor( + ref_srcDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), + make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), + make_tuple(invariantDims{}, toReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); + static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + 
decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, + int BlkGroupSize, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)BlkGroupSize; + (void)ws_buf2_bytes_offset; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = + GridwiseReduction_xy_to_x_direct_warpwise; + + constexpr int RunId = need_indices ? 2 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(p_src_global), + beta, + static_cast(p_dst_global), + static_cast(nullptr), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp new file mode 100644 index 0000000000..b7b58cbb90 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp @@ -0,0 +1,282 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_blockwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; // this could be empty + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, + int BlkGroupSize, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)GridSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 
2048; + + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + + if constexpr(src2d_need_padding) + { + const auto srcPad = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = + make_tuple_from_seq(typename uniform_sequence_gen::type{}); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(ref_tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return 
(*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + void* ws_buf1_global = static_cast(ws_global) + 4096; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp new file mode 100644 index 0000000000..ef88547028 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp @@ -0,0 +1,296 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; // this could be empty + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, + int BlkGroupSize, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleDstLengths = 
make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = GredThreadBufferLength; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = + make_tuple_from_seq(typename uniform_sequence_gen::type{}); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(ref_tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + refType_dst1dDesc_padded; + +template +static 
__device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + void* ws_buf1_global = static_cast(ws_global) + 4096; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp new file mode 100644 index 0000000000..53b0e1e759 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp @@ -0,0 +1,298 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; // this could be empty + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +//////////////////////////////////////////////////////////////////////////////////////// +using specDims = typename sequence_merge::type; + +static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, + "Wrong invariant and/or toReduce dimensions!"); + +// The number of invariant dimensions can be zero if all dimension are to be reduced +static_assert(invariantDims::Size() > 0 || dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable + +// helper functions using variadic template arguments +template +__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +__device__ static auto make_tuple_from_array(const int* lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +__device__ static constexpr auto make_tuple_from_seq(Sequence) +{ + return make_tuple(Ns...); +}; + +extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, + int BlkGroupSize, + int outLength0, + int outLength1, + int outLength2, + int outLength3, + int outLength4, + int outLength5, + int outStride0, + int outStride1, + int outStride2, + int outStride3, + int outStride4, + int outStride5, + void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const int dstLengths[6] = { + outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; + const int dstStrides[6] = { + outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; + + const auto tupleDstLengths = 
make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + + const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto dst1dDesc = transform_tensor_descriptor( + dstDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dst1dDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(hipThreadIdx_x == 0) + *static_cast(p_dst1dDesc) = dst1dDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = + make_tuple_from_seq(typename uniform_sequence_gen::type{}); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( + ref_dstDesc, + make_tuple(make_merge_transform(ref_tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dst1dDesc); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dst1dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = + typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = + typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types:: + refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = + typename get_ref_desc_types:: + 
refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + void* __restrict__ ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = ws_global; + const void* p_dst1dDesc = static_cast(ws_global) + 2048; + void* ws_buf1_global = static_cast(ws_global) + 4096; + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = + GridwiseReduction_xy_to_x_direct_warpwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +};
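
A note on the workspace protocol shared by all of the wrappers above: each *_prepare kernel serializes the runtime-built src2dDesc at byte offset 0 of ws_global and the dst1dDesc at byte offset 2048, while the two-pass path keeps its partial-value buffer (ws_buf1) at offset 4096 and, when indices are requested, the index buffer (ws_buf2) a further ws_buf2_bytes_offset bytes beyond ws_buf1. The sketch below reproduces that pattern with a hypothetical POD SimpleDesc2D standing in for the CK descriptor types; it illustrates the mechanism and is not code from this diff.

#include <hip/hip_runtime.h>

struct SimpleDesc2D // hypothetical stand-in for a CK tensor descriptor
{
    int lengths[2];
    int strides[2];
};

// one thread serializes the runtime-built descriptor into the workspace,
// just as the *_prepare kernels above do at offsets 0 and 2048
__global__ void prepare_sketch(int len0, int len1, void* __restrict__ ws_global)
{
    if(hipThreadIdx_x == 0)
        *static_cast<SimpleDesc2D*>(ws_global) = SimpleDesc2D{{len0, len1}, {len1, 1}};
}

// the compute kernel recovers it by casting to the compile-time-known type
__global__ void run_sketch(const void* __restrict__ ws_global,
                           const float* __restrict__ src,
                           float* __restrict__ dst) // dst assumed zero-initialized
{
    const SimpleDesc2D desc = *static_cast<const SimpleDesc2D*>(ws_global);
    const int row = hipBlockIdx_x; // one output row per block, for simplicity
    for(int j = hipThreadIdx_x; j < desc.lengths[1]; j += hipBlockDim_x)
        atomicAdd(&dst[row], src[row * desc.strides[0] + j * desc.strides[1]]);
}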
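
The get_ref_desc_types machinery exists because a descriptor's C++ type cannot cross the extern "C" kernel boundary: each wrapper instead rebuilds the same chain of transforms from dummy compile-time lengths and takes decltype of the result. That works because the descriptor type is fixed by the transform structure, not by the length values stored inside it. A reduced stand-alone analogue (standard C++ only; all names here are invented for illustration):

#include <cassert>
#include <cstring>
#include <tuple>
#include <type_traits>

// the return type depends only on the argument types, never on the values,
// just as a CK descriptor type depends only on its transforms
template <typename... Ts>
auto make_desc(Ts... lengths)
{
    return std::make_tuple(lengths...);
}

using refType_desc = decltype(make_desc(8, 8)); // dummy lengths, real type

int main()
{
    static_assert(std::is_same<decltype(make_desc(64, 1000)), refType_desc>::value,
                  "same type regardless of the length values");

    alignas(refType_desc) unsigned char workspace[2048];
    const auto runtime_desc = make_desc(64, 1000); // "prepare" side
    std::memcpy(workspace, &runtime_desc, sizeof(runtime_desc));

    // consumer side: cast the raw workspace bytes back via the reference type
    const auto* recovered = reinterpret_cast<const refType_desc*>(workspace);
    assert(std::get<1>(*recovered) == 1000);
    return 0;
}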
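
The padding branches in every prepare kernel follow one formula: round the reduced length up to a multiple of copySliceLen (the slice each warp or thread consumes per pass), and round the invariant length up to the number of independent workers in the grid (GridSize * BlockSize / warpSize for the warpwise variants, GridSize * BlockSize for the threadwise one). A worked instance of the ceiling computation, assuming a wavefront of 64 and GredAccessesPerThreadInWarp = 4:

constexpr int kWarpSize                   = 64; // assumed wavefront size
constexpr int GredAccessesPerThreadInWarp = 4;
constexpr int copySliceLen = kWarpSize * GredAccessesPerThreadInWarp; // 256

constexpr int toReduceLen = 1000;
constexpr int srcPad2 =
    ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

// 1000 is rounded up to 4 * 256 = 1024 elements, so 24 padding elements
static_assert(srcPad2 == 24, "a row of 1000 is padded to 1024");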
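
Finally, how a host is expected to drive these entry points: launch prepare then compute for the first call, and, when BlkGroupSize > 1 (the multi-block path), follow with the second-call pair so the BlkGroupSize partial values per output element left in ws_buf1 are folded down. The RunId constants suggest Run<1> is the no-index variant, Run<2> the first pass that produces indices, and Run<3> the second pass that consumes the workspace indices, hence need_indices ? 2 : 1 above versus need_indices ? 3 : 1 in the second-call wrappers. The sketch below is hedged: it assumes the files above were compiled with CK_PARAM_* settings matching a row-major 64 x 1000 float source reduced over its second dimension, and the grid/block sizes are illustrative, not tuned.

#include <hip/hip_runtime.h>

// declarations repeated from the diff so this sketch is a separate TU
extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int, int, int, int, int, int, int, int, int, int, int, int, int, int,
    int, int, int, int, int, int, int, int, int, int, int, int,
    void* __restrict__);
extern "C" __global__ void gridwise_generic_reduce_1(
    int, int, float, const void* __restrict__, float, void* __restrict__,
    void* __restrict__, long, void* __restrict__);

void reduce_64x1000(const float* src, float* dst, void* ws)
{
    const int GridSize = 64, BlockSize = 256, BlkGroupSize = 1;

    // build and serialize the descriptors (a single thread suffices)
    hipLaunchKernelGGL(gridwise_generic_reduce_1_prepare, dim3(1), dim3(1), 0, 0,
                       GridSize, BlkGroupSize,
                       64, 1000, 1, 1, 1, 1, // inLengths, unused dims = 1
                       1000, 1, 0, 0, 0, 0,  // inStrides, unused dims = 0
                       64, 1, 1, 1, 1, 1,    // outLengths
                       1, 0, 0, 0, 0, 0,     // outStrides
                       ws);

    // first (and with BlkGroupSize == 1, only) reduction pass
    hipLaunchKernelGGL(gridwise_generic_reduce_1,
                       dim3(GridSize), dim3(BlockSize), 0, 0,
                       /*origReduceLen=*/1000, BlkGroupSize,
                       /*alpha=*/1.0f, static_cast<const void*>(src),
                       /*beta=*/0.0f, static_cast<void*>(dst),
                       ws, /*ws_buf2_bytes_offset=*/0L,
                       /*indices_global=*/static_cast<void*>(nullptr));
}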