From 70544c4c2cde33ea825365418d9d76b2f664a905 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Tue, 5 Apr 2022 09:31:44 +0800 Subject: [PATCH] Improve Reduction kernel api (#152) * Add ThreadwiseReduction functor as per-thread reduction api * Using ThreadwiseReduce api and some change in using PartitionedBlockwiseReduction api to simply the kernels * Add comments and remove useless declarations in the kernels * Tiny updates [ROCm/composable_kernel commit: 82c8b9f8eeffc1b9a72dc5a84137ece88e8d5941] --- .../block/reduction_functions_blockwise.hpp | 70 +++++--- .../grid/gridwise_2d_reduction_blockwise.hpp | 154 ++++++++---------- ...ise_2d_reduction_multiblock_atomic_add.hpp | 53 +++--- ...2d_reduction_multiblock_partial_reduce.hpp | 99 +++++------ .../grid/gridwise_2d_reduction_threadwise.hpp | 79 +++++---- .../thread/reduction_functions_threadwise.hpp | 122 ++++++++++++++ 6 files changed, 348 insertions(+), 229 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp diff --git a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp index 842dc6693f..cc452b5e5c 100644 --- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp +++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp @@ -26,16 +26,20 @@ #ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP #define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP -#include "data_type.hpp" - #include "reduction_common.hpp" -#include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "cluster_descriptor.hpp" namespace ck { +// clang-format off +// Assume: +// 1) work_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data +// 2) work_buffer has AccDataType elements, and space size is no less than BlockSize +// 3) in_out_value is the input data in vgpr from each thread +// 4) in_out_value is the over-written reduced output in vgpr for each thread +// clang-format on template ; template - __device__ static void Reduce(BufferType& block_buffer, AccDataType& accuData) + __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value) { + static_assert(is_same{}, + "Buffer data type should be consistent as AccDataType!"); + constexpr auto cluster_len_shift = get_shift(); const auto thread_cluster_idx = @@ -71,6 +78,10 @@ struct PartitionedBlockwiseReduction const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + work_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value; + + __syncthreads(); + static_for<0, cluster_len_shift, 1>{}([&](auto I) { constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); @@ -80,10 +91,10 @@ struct PartitionedBlockwiseReduction index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + make_tuple(0, indOffset)); - AccDataType opData1 = type_convert(block_buffer[offset1]); - AccDataType opData2 = type_convert(block_buffer[offset2]); + AccDataType opData1 = work_buffer[offset1]; + AccDataType opData2 = work_buffer[offset2]; Accumulation::Calculate(opData1, opData2); - block_buffer(offset1) = type_convert(opData1); + work_buffer(offset1) = opData1; } __syncthreads(); @@ -91,10 +102,17 @@ struct PartitionedBlockwiseReduction index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); - accuData = type_convert(block_buffer[offset]); + 
in_out_value = work_buffer[offset]; }; }; +// clang-format off +// Assume: +// 1) work_val_buffer/work_idx_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data +// 2) work_val_buffer/work_idx_buffer has AccDataType/IndexDataType elements, and space size is no less than BlockSize +// 3) in_out_value/in_out_index is the input data in vgpr from each thread +// 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread +// clang-format on template - __device__ static void Reduce(BufferType& block_val_buffer, - IdxBufferType& block_idx_buffer, - AccDataType& accuData, - IndexDataType& accuIndex) + __device__ static void Reduce(BufferType& work_val_buffer, + IdxBufferType& work_idx_buffer, + AccDataType& in_out_value, + IndexDataType& in_out_index) { + static_assert(is_same{}, + "Buffer data type should be consistent as AccDataType!"); + static_assert(is_same{}, + "Buffer data type should be consistent as IndexDataType!"); + constexpr auto cluster_len_shift = get_shift(); const auto thread_cluster_idx = @@ -136,6 +159,11 @@ struct PartitionedBlockwiseReductionWithIndex const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + work_val_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value; + work_idx_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_index; + + __syncthreads(); + static_for<0, cluster_len_shift, 1>{}([&](auto I) { constexpr index_t indOffset = 1 << I(); @@ -145,14 +173,14 @@ struct PartitionedBlockwiseReductionWithIndex index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + make_tuple(0, indOffset)); - AccDataType opData1 = type_convert(block_val_buffer[offset1]); - AccDataType opData2 = type_convert(block_val_buffer[offset2]); - IndexDataType currIndex1 = block_idx_buffer[offset1]; - IndexDataType currIndex2 = block_idx_buffer[offset2]; + AccDataType opData1 = work_val_buffer[offset1]; + AccDataType opData2 = work_val_buffer[offset2]; + IndexDataType currIndex1 = work_idx_buffer[offset1]; + IndexDataType currIndex2 = work_idx_buffer[offset2]; Accumulation::Calculate(opData1, opData2, currIndex1, currIndex2); - block_val_buffer(offset1) = type_convert(opData1); - block_idx_buffer(offset1) = currIndex1; + work_val_buffer(offset1) = opData1; + work_idx_buffer(offset1) = currIndex1; } __syncthreads(); @@ -160,9 +188,9 @@ struct PartitionedBlockwiseReductionWithIndex index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); - accuData = type_convert(block_val_buffer[offset]); - accuIndex = block_idx_buffer[offset]; - } + in_out_value = work_val_buffer[offset]; + in_out_index = work_idx_buffer[offset]; + }; }; }; // end of namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp index a81739fdeb..6826d5211c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp @@ -31,6 +31,7 @@ #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "reduction_functions_blockwise.hpp" +#include "reduction_functions_threadwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "cluster_descriptor.hpp" #include "element_wise_operation.hpp" @@ -179,10 +180,10 @@ struct 
GridwiseReduction_mk_to_m_blockwise static constexpr auto thread_cluster_desc = make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the - // Dim_K as the fastest one - static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); using PassThroughOp = tensor_operation::element_wise::PassThrough; @@ -216,14 +217,18 @@ struct GridwiseReduction_mk_to_m_blockwise ThreadClusterArrangeOrder, ReduceOperation, PropagateNan>; - using Accumulation = - detail::AccumulateWithNanCheck; + + using ThreadwiseReduce = ThreadwiseReduction; (void)p_ws_indices_global; (void)p_indices_global; // LDS - __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; const auto zeroVal = ReduceOperation::GetReductionZeroVal(); @@ -232,8 +237,8 @@ struct GridwiseReduction_mk_to_m_blockwise auto out_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); StaticBuffer in_thread_buf; @@ -285,38 +290,26 @@ struct GridwiseReduction_mk_to_m_blockwise make_tuple(I0, I0), in_thread_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); - }); - - // reduce on each thread-local slice - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); }); }); + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedTiles++; } while(reducedTiles < toReduceTiles); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - accu_value_buf[I]; - - accu_value_buf(I) = zeroVal; - - __syncthreads(); - - BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I)); - }); + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { if(thread_k_cluster_id == 0) @@ -414,8 +407,8 @@ struct GridwiseReduction_mk_to_m_blockwise (void)p_ws_indices_global; // LDS - __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; - __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; const auto 
zeroVal = ReduceOperation::GetReductionZeroVal(); @@ -426,15 +419,18 @@ struct GridwiseReduction_mk_to_m_blockwise auto out_global_idx_buf = make_dynamic_buffer( p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); - auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); StaticBuffer in_thread_val_buf; - StaticBuffer + StaticBuffer in_thread_idx_buf; StaticBuffer accu_value_buf; @@ -491,42 +487,36 @@ struct GridwiseReduction_mk_to_m_blockwise make_tuple(I0, I0), in_thread_val_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(offset) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + J(); + in_thread_idx_buf(Number{}) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset)); + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); }); AccDataType tmpValue = zeroVal; IndexDataType tmpIndex = 0; - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - // reduce on the dim1 thread slice - AccumulationWithIndex::Calculate( - tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); }); - // store thread local value to LDS for parallel reduction - block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpValue; - block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpIndex; - - __syncthreads(); - BlockwiseReduceWithIndex::Reduce( - block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex); + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); AccumulationWithIndex::Calculate( - accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); }); threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); @@ -535,8 +525,7 @@ struct GridwiseReduction_mk_to_m_blockwise reducedTiles++; } while(reducedTiles < toReduceTiles); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { if(thread_k_cluster_id == 0) @@ -665,8 +654,8 @@ struct GridwiseReduction_mk_to_m_blockwise (void)in_elementwise_op; // LDS - __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; - __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; const auto 
zeroVal = ReduceOperation::GetReductionZeroVal(); @@ -681,10 +670,10 @@ struct GridwiseReduction_mk_to_m_blockwise auto out_global_idx_buf = make_dynamic_buffer( p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); - auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); StaticBuffer in_thread_val_buf; @@ -745,8 +734,6 @@ struct GridwiseReduction_mk_to_m_blockwise thread_m_cluster_id * MThreadSliceSize, thread_k_cluster_id * KThreadSliceSize)); - // index_t indexOffset = 0; - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; accu_index_buf(I) = 0; @@ -771,42 +758,33 @@ struct GridwiseReduction_mk_to_m_blockwise make_tuple(I0, I0), in_thread_idx_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { AccDataType tmpValue = zeroVal; IndexDataType tmpIndex = 0; - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - // reduce on the dim1 thread slice - AccumulationWithIndex::Calculate( - tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); }); - // store thread local value to LDS for parallel reduction - block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpValue; - block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpIndex; - - __syncthreads(); - BlockwiseReduceWithIndex::Reduce( - block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex); + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); AccumulationWithIndex::Calculate( - accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); }); threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - // indexOffset += K_BlockTileSize; reducedTiles++; } while(reducedTiles < toReduceTiles); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { if(thread_k_cluster_id == 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp index 2d54e84954..4e325f3573 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp @@ -30,6 +30,7 @@ #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "reduction_functions_blockwise.hpp" +#include "reduction_functions_threadwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "element_wise_operation.hpp" @@ -103,10 +104,10 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add static constexpr auto 
thread_cluster_desc = make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the - // Dim_K as the fastest one - static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); using BlockwiseReduce = PartitionedBlockwiseReduction; + using ThreadwiseReduce = ThreadwiseReduction; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; @@ -138,15 +145,15 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add const auto zeroVal = ReduceOperation::GetReductionZeroVal(); // LDS - __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); auto out_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); StaticBuffer in_thread_buf; @@ -198,42 +205,30 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add make_tuple(I0, I0), in_thread_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); - }); - - // reduce on each thread-local slice - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); }); }); + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedTiles++; } while(reducedTiles < num_k_block_tile_iteration); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; // Each block executes multiple parallel reductions on the LDS, and by atomic-adding its // reduced output to the global location corresponding to each invariant dimension to get a // consistent reduced result for that invariant dimension. due to the using of vector_load, // each block/thread is involved into multiple invarirant dimensions. 
- static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - accu_value_buf[I]; - - accu_value_buf(I) = zeroVal; - - __syncthreads(); - - BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I)); - }); + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { if(thread_k_cluster_id == 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp index bab95cf4d0..d1be1f5275 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp @@ -30,6 +30,7 @@ #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "reduction_functions_blockwise.hpp" +#include "reduction_functions_threadwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "cluster_descriptor.hpp" #include "element_wise_operation.hpp" @@ -121,10 +122,10 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce static constexpr auto thread_cluster_desc = make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the - // Dim_K as the fastest one - static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); using PassThroughOp = tensor_operation::element_wise::PassThrough; @@ -151,8 +152,11 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce ReduceOperation, PropagateNan>; - using Accumulation = - detail::AccumulateWithNanCheck; + using ThreadwiseReduce = ThreadwiseReduction; (void)p_ws_indices_global; (void)acc_elementwise_op; @@ -160,7 +164,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce const auto zeroVal = ReduceOperation::GetReductionZeroVal(); // LDS - __shared__ AccDataType p_block_reduce_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; const auto in_global_buf = make_dynamic_buffer(p_src_global, @@ -169,8 +173,8 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce auto workspace_global_buf = make_dynamic_buffer( p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); StaticBuffer in_thread_buf; @@ -222,20 +226,17 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_tuple(I0, I0), in_thread_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); - }); - - // reduce on each thread-local slice - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - Accumulation::Calculate(accu_value_buf(I), 
in_thread_buf[offset]); + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); }); }); + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedTiles++; @@ -243,16 +244,8 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce // Each block executes multiple parallel reductions on the LDS, and due to the using of // vector_load, each block/thread is involved into multiple invarirant dimensions. - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - accu_value_buf[I]; - - accu_value_buf(I) = zeroVal; - - __syncthreads(); - - BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I)); - }); + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number<1>{})); @@ -315,8 +308,8 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce const auto zeroVal = ReduceOperation::GetReductionZeroVal(); // LDS - __shared__ AccDataType p_block_reduce_val_buffer[BlockSize]; - __shared__ index_t p_block_reduce_idx_buffer[BlockSize]; + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ index_t p_reduce_work_idx_buffer[BlockSize]; const auto in_global_buf = make_dynamic_buffer(p_src_global, @@ -327,10 +320,10 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce auto workspace_global_idx_buf = make_dynamic_buffer( p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize()); - auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); - auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); StaticBuffer in_thread_val_buf; @@ -394,42 +387,36 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_tuple(I0, I0), in_thread_val_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(offset) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + J(); + in_thread_idx_buf(Number{}) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset)); + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); }); AccDataType tmpValue = zeroVal; IndexDataType tmpIndex = 0; - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - // reduce on the dim1 thread slice - AccumulationWithIndex::Calculate( - tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]); 
+ AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); }); - // store thread local value to LDS for parallel reduction - block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpValue; - block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = - tmpIndex; - - __syncthreads(); - BlockwiseReduceWithIndex::Reduce( - block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex); + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); AccumulationWithIndex::Calculate( - accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex); + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); }); threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index 8a4985595b..c047f7e375 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -30,6 +30,7 @@ #include "reduction_common.hpp" #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" +#include "reduction_functions_threadwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "element_wise_operation.hpp" @@ -110,6 +111,11 @@ struct GridwiseReduction_mk_to_m_threadwise using ThreadBufferDimAccessOrder = typename conditional, Sequence<0, 1>>::type; + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; @@ -124,9 +130,11 @@ struct GridwiseReduction_mk_to_m_threadwise OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { - - using Accumulation = - detail::AccumulateWithNanCheck; + using ThreadwiseReduce = ThreadwiseReduction; (void)p_indices_global; @@ -175,20 +183,17 @@ struct GridwiseReduction_mk_to_m_threadwise make_tuple(I0, I0), in_thread_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); - }); - - // reduce on each thread-local slice - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]); + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); }); }); + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedLength += KThreadSliceSize; @@ -200,8 +205,7 @@ struct GridwiseReduction_mk_to_m_threadwise accu_value_buf(I) *= alpha; }); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; if constexpr(!BetaIsZero) { @@ -266,10 +270,13 @@ struct 
GridwiseReduction_mk_to_m_threadwise OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + using ThreadwiseReduceWithIndex = ThreadwiseReductionWithIndex; + (void)acc_elementwise_op; const auto zeroVal = ReduceOperation::GetReductionZeroVal(); @@ -282,7 +289,13 @@ struct GridwiseReduction_mk_to_m_threadwise p_indices_global, out_grid_desc_m.GetElementSpaceSize()); StaticBuffer - in_thread_buf; + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; StaticBuffer accu_value_buf; StaticBuffer accu_index_buf; @@ -322,26 +335,23 @@ struct GridwiseReduction_mk_to_m_threadwise in_global_buf, thread_buffer_desc, make_tuple(I0, I0), - in_thread_buf); + in_thread_val_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset)); - }); + in_thread_idx_buf(Number{}) = indexStart + iK(); - // reduce on each thread-local slice - static_for<0, KThreadSliceSize, 1>{}([&](auto J) { - constexpr auto offset = I * Number{} + J; - AccumulationWithIndex::Calculate(accu_value_buf(I), - in_thread_buf[offset], - accu_index_buf(I), - indexStart + J); + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); }); }); + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); indexStart += KThreadSliceSize; @@ -355,8 +365,7 @@ struct GridwiseReduction_mk_to_m_threadwise accu_value_buf(I) *= alpha; }); - constexpr auto reduced_data_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; if constexpr(!BetaIsZero) { diff --git a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp new file mode 100644 index 0000000000..3dcfe3a030 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp @@ -0,0 +1,122 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP +#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP + +#include "reduction_functions_accumulate.hpp" + +namespace ck { + +// Assume +// 1) SrcDesc is known at compile-time +// 2) DstDesc is known at compile-time +// 3) SrcBuffer is static buffer +// 4) DstBuffer is static buffer +template +struct ThreadwiseReduction +{ + static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{}; + static constexpr auto dst_thread_desc_m = DstThreadDesc_M{}; + + static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{}); + + static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!"); + + using Accumulation = detail::AccumulateWithNanCheck; + + template + __device__ static void Reduce(const SrcBufferType& src_buf, DstBufferType& dst_buf) + { + static_for<0, src_length_m, 1>{}([&](auto iM) { + constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM)); + + static_for<0, src_length_k, 1>{}([&](auto iK) { + constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + Accumulation::Calculate(dst_buf(Number{}), src_buf[Number{}]); + }); + }); + }; +}; + +// Assume +// 1) SrcDesc is known at compile-time +// 2) DstDesc is known at compile-time +// 3) SrcBuffer is static buffer +// 4) DstBuffer is static buffer +template +struct ThreadwiseReductionWithIndex +{ + static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{}; + static constexpr auto dst_thread_desc_m = DstThreadDesc_M{}; + + static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{}); + + static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!"); + + using Accumulation = + detail::AccumulateWithIndexAndNanCheck; + + template + __device__ static void Reduce(const SrcValueBufferType& src_val_buf, + const SrcIndexBufferType& src_idx_buf, + DstValueBufferType& dst_val_buf, + DstIndexBufferType& dst_idx_buf) + { + static_for<0, src_length_m, 1>{}([&](auto iM) { + constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM)); + + static_for<0, src_length_k, 1>{}([&](auto iK) { + constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + Accumulation::Calculate(dst_val_buf(Number{}), + src_val_buf[Number{}], + dst_idx_buf(Number{}), + src_idx_buf[Number{}]); + }); + }); + }; +}; + +}; // end of namespace ck + +#endif
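
For reference, a minimal host-side sketch of the contract the new ThreadwiseReduction functor is expected to satisfy. This is not part of the patch: the CK descriptors, StaticBuffer and detail::AccumulateWithNanCheck helpers are replaced here by plain arrays and loops, and the names and sizes are made up for illustration. The point it shows is that every row of an M x K per-thread slice is folded into one accumulator per M row, and that the destination buffer must be pre-filled with the reduction identity, exactly as the gridwise kernels above fill accu_value_buf with ReduceOperation::GetReductionZeroVal() before calling ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf).

#include <array>
#include <cmath>
#include <cstdio>

// Host-side mock of ThreadwiseReduction<...>::Reduce(): row-major M x K source
// with K as the fastest dimension, matching the packed thread descriptors used
// in the kernels above. All CK-specific types are deliberately omitted.
template <typename AccDataType, int M, int K, typename ReduceOp, bool PropagateNan>
struct ThreadwiseReductionSketch
{
    static void Reduce(const std::array<AccDataType, M * K>& src_buf,
                       std::array<AccDataType, M>& dst_buf)
    {
        for(int iM = 0; iM < M; ++iM)
            for(int iK = 0; iK < K; ++iK)
            {
                const AccDataType v = src_buf[iM * K + iK];
                if(PropagateNan && std::isnan(v))
                    dst_buf[iM] = v; // mirrors the NaN-propagating accumulate path
                else
                    ReduceOp{}(dst_buf[iM], v);
            }
    }
};

struct AddOp // stand-in for a CK reduce operation
{
    template <typename T>
    void operator()(T& acc, T v) const { acc += v; }
    template <typename T>
    static constexpr T GetIdentity() { return T{0}; }
};

int main()
{
    constexpr int M = 2, K = 4;
    std::array<float, M * K> in_thread_buf{{1, 2, 3, 4, 5, 6, 7, 8}};
    // the accumulator buffer starts at the reduction identity (zeroVal in the kernels)
    std::array<float, M> accu_value_buf{{AddOp::GetIdentity<float>(), AddOp::GetIdentity<float>()}};

    ThreadwiseReductionSketch<float, M, K, AddOp, true>::Reduce(in_thread_buf, accu_value_buf);
    std::printf("%f %f\n", accu_value_buf[0], accu_value_buf[1]); // prints 10.000000 26.000000
}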
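
Similarly, a hedged sketch of what the new ThreadwiseReductionWithIndex functor provides for the indexable (argmax/argmin-style) kernel paths: values and their source indices are reduced together, so each output row keeps the index of the value it selected. Again this is illustration only, with "max" written out directly instead of a CK ReduceOperation / AccumulateWithIndexAndNanCheck pair, and with arbitrary example data.

#include <array>
#include <cstdio>

// Host-side mock of ThreadwiseReductionWithIndex::Reduce(): for each M row,
// pick the max over K values and carry along the index that produced it.
template <typename AccT, typename IdxT, int M, int K>
struct ThreadwiseMaxWithIndexSketch
{
    static void Reduce(const std::array<AccT, M * K>& src_val,
                       const std::array<IdxT, M * K>& src_idx,
                       std::array<AccT, M>& dst_val,
                       std::array<IdxT, M>& dst_idx)
    {
        for(int iM = 0; iM < M; ++iM)
            for(int iK = 0; iK < K; ++iK)
            {
                const int off = iM * K + iK;
                if(src_val[off] > dst_val[iM]) // "max" stands in for ReduceOperation
                {
                    dst_val[iM] = src_val[off];
                    dst_idx[iM] = src_idx[off];
                }
            }
    }
};

int main()
{
    constexpr int M = 1, K = 4;
    std::array<float, M * K> vals{{3.f, 9.f, 1.f, 7.f}};
    std::array<int, M * K> idxs{{0, 1, 2, 3}};  // plays the role of in_thread_idx_buf above
    std::array<float, M> best_val{{-3.0e38f}};  // identity for a max reduction
    std::array<int, M> best_idx{{0}};

    ThreadwiseMaxWithIndexSketch<float, int, M, K>::Reduce(vals, idxs, best_val, best_idx);
    std::printf("max=%f at index %d\n", best_val[0], best_idx[0]); // max=9.000000 at index 1
}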