Merge pull request #31 from ROCmSoftwarePlatform/miopen_downstream-dynamic_reduction_pr
[MIOpen Downstream] Dynamic Reduction PR
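The three headers added below share one pattern: a gridwise reduction struct templated over the data types, the 2d source and 1d destination tensor descriptors, the reduction operator, and a per-thread tiling parameter, each with three in-class specializations of Run. Run<1> reduces values only; Run<2> also emits the index of the selected element (used for MIN/MAX/AMAX with indices); Run<3> re-reduces a workspace of partial values and indices produced by an earlier pass. A kernel wrapper dispatching on RunId might look like the sketch below; the wrapper itself is hypothetical and not part of this diff, it merely forwards the argument list that every Run specialization accepts.

// Hypothetical dispatch sketch (not part of this diff).
template <typename GridwiseReduction,
          int RunId,
          typename src2dDescType,
          typename dst1dDescType,
          typename srcDataType,
          typename dstDataType>
__global__ void gridwise_generic_reduce(const src2dDescType src2dDesc,
                                        const dst1dDescType dst1dDesc,
                                        int origReduceLen,
                                        srcDataType alpha,
                                        const srcDataType* const __restrict__ p_src_global,
                                        dstDataType beta,
                                        dstDataType* const __restrict__ p_dst_global,
                                        const int* const __restrict__ ws_indices_global,
                                        int* const __restrict__ indices_global)
{
    GridwiseReduction::template Run<RunId>(src2dDesc,
                                           dst1dDesc,
                                           origReduceLen,
                                           alpha,
                                           p_src_global,
                                           beta,
                                           p_dst_global,
                                           ws_indices_global,
                                           indices_global);
}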
@@ -0,0 +1,613 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"

#include "blockwise_tensor_slice_transfer.hpp"

namespace ck {

template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          bool isFirstCall,
          bool isLastCall,
          index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_blockwise
{
    using opReduce = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;

    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
    using blockwise_reduce =
        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;

    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();

    static constexpr auto I0 = Number<0>{};

    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global);

    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;
        (void)indices_global;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);
        const posUnaryOpType posUnaryOp(divider);

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);

            __syncthreads();

            // do element-wise pre-reduction operation
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;
            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     false>(dst1dDesc,
                                                            make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(
                    dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            threadwise_dst_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
        }
    };

    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        auto in_block_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType, // element type of the LDS value buffer
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        int indexOffset = 0;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // load block data from global to LDS, no use of double buffers (to be improved)
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

            __syncthreads();

            // construct the indices for the current toReduce blocks
            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            indexOffset += BlockBufferSize;

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     false>(dst1dDesc,
                                                            make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   false>(dst1dDesc,
                                                          make_multi_index(block_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    };

    template <>
    __device__ static void Run<3>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ ws_values_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)origReduceLen;

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];
        __shared__ int block_indices_buffer[BlockBufferSize];

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
                                                            src2dDesc.GetElementSpaceSize(),
                                                            type_convert<srcDataType>{}(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        auto in_block_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        auto in_block_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        constexpr auto in_block_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_val_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        auto blockwise_src_idx_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            int,
                                            int,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(src2dDesc,
                                                  make_multi_index(block_global_1d_id, 0),
                                                  in_block_desc,
                                                  make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // load block data from global to LDS, no use of double buffers (to be improved)
            blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf);
            blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf);
            blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf);
            blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf);

            __syncthreads();

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
            blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(dst1dDesc,
                                                           make_multi_index(block_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(block_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(block_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    };
};

} // namespace ck
#endif
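For context, a minimal instantiation of the blockwise variant above could look as follows. This is a sketch only: the descriptor objects src2dDesc/dst1dDesc are assumed to be built elsewhere, and the enumerator spellings (ck::ReduceTensorOp_t::ADD, ck::NanPropagation_t::NOT_PROPAGATE_NAN, ck::ReduceTensorIndices_t::NO_INDICES) are assumptions based on the reduction_common.hpp/reduction_operator.hpp includes; only the template-parameter order comes from the header itself. One thread block produces one element of the 1d output.

// Illustrative instantiation sketch (assumptions noted above).
using GridwiseReduce =
    ck::GridwiseReduction_xy_to_x_blockwise<256,   // BlockSize
                                            float, // srcDataType
                                            float, // dstDataType
                                            float, // compType
                                            decltype(src2dDesc),
                                            decltype(dst1dDesc),
                                            ck::ReduceTensorOp_t::ADD,
                                            ck::NanPropagation_t::NOT_PROPAGATE_NAN,
                                            ck::ReduceTensorIndices_t::NO_INDICES,
                                            true, // isFirstCall
                                            true, // isLastCall
                                            2>;   // GredAccessesPerThreadInBlock

// RunId == 1: value-only reduction, so the two index pointers are unused.
GridwiseReduce::Run<1>(src2dDesc, dst1dDesc, origReduceLen, alpha, p_src_global,
                       beta, p_dst_global, nullptr, nullptr);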
@@ -0,0 +1,491 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_threadwise.hpp"

#include "threadwise_tensor_slice_transfer.hpp"

namespace ck {

template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          bool isFirstCall,
          bool isLastCall,
          index_t GredThreadBufferLength>
struct GridwiseReduction_xy_to_x_direct_threadwise
{
    using opReduce = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;

    static constexpr auto I0 = Number<0>{};

    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global);

    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;
        (void)indices_global;

        const auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_buf;

        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);
        const posUnaryOpType posUnaryOp(divider);

        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                    compType,
                                                                    src2dDescType,
                                                                    decltype(ThreadBufferDesc),
                                                                    ThreadBufferLengths,
                                                                    Sequence<0, 1>,
                                                                    1,
                                                                    1,
                                                                    1,
                                                                    false>(
            src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // do element-wise pre-reduction operation
            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);

            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>{}(alpha);

        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        true>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
        }

        auto threadwise_dst_store =
            ThreadwiseTensorSliceTransfer_v1r3<compType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               true>(dst1dDesc,
                                                     make_multi_index(thread_global_1d_id));

        threadwise_dst_store.Run(
            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
    };

    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;

        const auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_buf;

        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                             compType, // element type of in_thread_buf
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        index_t indexStart = 0;
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);

            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce2(
                in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);

            indexStart += GredThreadBufferLength;

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>{}(alpha);

        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
        }

        auto threadwise_dst_val_store =
            ThreadwiseTensorSliceTransfer_v1r3<compType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(dst1dDesc,
                                                      make_multi_index(thread_global_1d_id));

        auto threadwise_dst_idx_store =
            ThreadwiseTensorSliceTransfer_v1r3<int,
                                               int,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(dst1dDesc,
                                                      make_multi_index(thread_global_1d_id));

        threadwise_dst_val_store.Run(
            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
        threadwise_dst_idx_store.Run(
            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
    };

    template <>
    __device__ static void Run<3>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ ws_values_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)origReduceLen;

        const auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
                                                            src2dDesc.GetElementSpaceSize(),
                                                            type_convert<srcDataType>{}(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true> in_thread_idx_buf;

        using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
                                                               decltype(in_thread_idx_buf),
                                                               opReduce,
                                                               nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        using ThreadBufferLengths       = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_val_load =
            ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                             compType, // element type of in_thread_val_buf
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc, make_multi_index(thread_global_1d_id, 0));

        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
                                                                        int,
                                                                        src2dDescType,
                                                                        decltype(ThreadBufferDesc),
                                                                        ThreadBufferLengths,
                                                                        Sequence<0, 1>,
                                                                        1,
                                                                        1,
                                                                        1,
                                                                        false>(
            src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_val_load.Run(src2dDesc,
                                        src_global_val_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_val_buf);
            threadwise_src_idx_load.Run(src2dDesc,
                                        src_global_idx_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_idx_buf);

            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce(
                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));

            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>{}(alpha);

        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
        }

        auto threadwise_dst_val_store =
            ThreadwiseTensorSliceTransfer_v1r3<compType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(dst1dDesc,
                                                      make_multi_index(thread_global_1d_id));

        auto threadwise_dst_idx_store =
            ThreadwiseTensorSliceTransfer_v1r3<int,
                                               int,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(dst1dDesc,
                                                      make_multi_index(thread_global_1d_id));

        threadwise_dst_val_store.Run(
            ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
        threadwise_dst_idx_store.Run(
            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
    };
};

} // namespace ck
#endif
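In the threadwise variant above, thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id() selects the source row, so each thread reduces exactly one output element. A host-side launcher therefore sizes the grid to cover the invariant (non-reduced) length; a minimal helper sketch (hypothetical, not part of this diff):

// Hypothetical host-side helper: one thread per element of the 1d output.
inline int grid_size_for_threadwise_reduce(int invariantLength, int blockSize)
{
    return (invariantLength + blockSize - 1) / blockSize; // ceiling division
}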
@@ -0,0 +1,532 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
|
||||
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
|
||||
|
||||
#include "data_type.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "reduction_operator.hpp"
|
||||
#include "reduction_functions_warpwise.hpp"
|
||||
|
||||
#include "threadwise_tensor_slice_transfer.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <index_t BlockSize,
|
||||
typename srcDataType,
|
||||
typename dstDataType,
|
||||
typename compType,
|
||||
typename src2dDescType,
|
||||
typename dst1dDescType,
|
||||
ReduceTensorOp_t op,
|
||||
NanPropagation_t nanPropaOpt,
|
||||
ReduceTensorIndices_t reduceIndicesOpt,
|
||||
bool isFirstCall,
|
||||
bool isLastCall,
|
||||
index_t GredAccessesPerThreadInWarp>
|
||||
struct GridwiseReduction_xy_to_x_direct_warpwise
|
||||
{
|
||||
using opReduce = typename reduce_binary_operator<compType, op>::opType;
|
||||
using preUnaryOpType =
|
||||
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
|
||||
using posUnaryOpType =
|
||||
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
template <int RunId>
|
||||
__device__ static void Run(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
dstDataType* const __restrict__ p_dst_global,
|
||||
const int* const __restrict__ ws_indices_global,
|
||||
int* const __restrict__ indices_global);
|
||||
|
||||
template <>
|
||||
__device__ static void Run<1>(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
dstDataType* const __restrict__ p_dst_global,
|
||||
const int* const __restrict__ ws_indices_global,
|
||||
int* const __restrict__ indices_global)
|
||||
{
|
||||
(void)ws_indices_global;
|
||||
(void)indices_global;
|
||||
|
||||
auto zeroVal = opReduce::GetZeroVal();
|
||||
|
||||
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
|
||||
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_dst_global, dst1dDesc.GetElementSpaceSize());
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
|
||||
in_thread_buf;
|
||||
|
||||
using warpwise_reduce =
|
||||
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
|
||||
|
||||
accuValue_buf(I0) = zeroVal;
|
||||
|
||||
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
|
||||
const int divider = origReduceLen;
|
||||
|
||||
const preUnaryOpType preUnaryOp(divider);
|
||||
const posUnaryOpType posUnaryOp(divider);
|
||||
|
||||
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
|
||||
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
|
||||
|
||||
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
|
||||
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
|
||||
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
|
||||
|
||||
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
|
||||
compType,
|
||||
src2dDescType,
|
||||
decltype(ThreadBufferDesc),
|
||||
ThreadBufferLengths,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
false>(
|
||||
src2dDesc,
|
||||
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
|
||||
|
||||
constexpr auto in_thread_copy_step =
|
||||
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
|
||||
|
||||
for(index_t reducedLength = 0; reducedLength < toReduceLength;
|
||||
reducedLength += warpSize * GredAccessesPerThreadInWarp)
|
||||
{
|
||||
threadwise_src_load.Run(
|
||||
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
|
||||
|
||||
// do element-wise pre-reduction operation
|
||||
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
|
||||
|
||||
// do the warp-wise reduction on data of all thread buffers
|
||||
warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
|
||||
}
|
||||
|
||||
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
|
||||
|
||||
constexpr auto ReducedDataDesc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
|
||||
|
||||
// The first thread in the warp stores the reduced result to the global location
|
||||
// representing the Warp
|
||||
if(thread_inwarp_id == 0)
|
||||
{
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuValue_buf(I0) *= type_convert<compType>{}(alpha);
|
||||
|
||||
if(!float_equal_zero{}(beta))
|
||||
{
|
||||
auto threadwise_dst_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<dstDataType,
|
||||
dstDataType,
|
||||
dst1dDescType,
|
||||
decltype(ReducedDataDesc),
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
true>(dst1dDesc,
|
||||
make_multi_index(warp_global_1d_id));
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
|
||||
|
||||
threadwise_dst_load.Run(
|
||||
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
|
||||
|
||||
accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf(I0) * beta);
|
||||
}
|
||||
|
||||
auto threadwise_dst_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<compType,
|
||||
dstDataType,
|
||||
decltype(ReducedDataDesc),
|
||||
dst1dDescType,
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(dst1dDesc,
|
||||
make_multi_index(warp_global_1d_id));
|
||||
|
||||
threadwise_dst_store.Run(
|
||||
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
__device__ static void Run<2>(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
dstDataType* const __restrict__ p_dst_global,
|
||||
const int* const __restrict__ ws_indices_global,
|
||||
int* const __restrict__ indices_global)
|
||||
{
|
||||
(void)ws_indices_global;
|
||||
|
||||
auto zeroVal = opReduce::GetZeroVal();
|
||||
|
||||
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
|
||||
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_dst_global, dst1dDesc.GetElementSpaceSize());
|
||||
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
indices_global, dst1dDesc.GetElementSpaceSize());
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
|
||||
in_thread_buf;
|
||||
|
||||
using warpwise_reduce =
|
||||
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
|
||||
|
||||
accuValue_buf(I0) = zeroVal;
|
||||
accuIndex_buf(I0) = 0;
|
||||
|
||||
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
|
||||
const int divider = origReduceLen;
|
||||
|
||||
const preUnaryOpType preUnaryOp(divider);
|
||||
|
||||
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
|
||||
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
|
||||
|
||||
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
|
||||
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
|
||||
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
|
||||
|
||||
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
|
||||
compType,
|
||||
src2dDescType,
|
||||
decltype(ThreadBufferDesc),
|
||||
ThreadBufferLengths,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
false>(
|
||||
src2dDesc,
|
||||
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
|
||||
|
||||
constexpr auto in_thread_copy_step =
|
||||
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
|
||||
|
||||
index_t indexOffset = 0;
|
||||
for(index_t reducedLength = 0; reducedLength < toReduceLength;
|
||||
reducedLength += warpSize * GredAccessesPerThreadInWarp)
|
||||
{
|
||||
threadwise_src_load.Run(
|
||||
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
|
||||
|
||||
// unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
|
||||
// done here
|
||||
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
|
||||
|
||||
// do the warp-wise reduction on data of all thread buffers
|
||||
warpwise_reduce::Reduce2(
|
||||
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset);
|
||||
|
||||
indexOffset += warpSize * GredAccessesPerThreadInWarp;
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
|
||||
}
|
||||
|
||||
constexpr auto ReducedDataDesc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
|
||||
|
||||
// The first thread in the warp stores the reduced result to the global location
|
||||
// representing the Warp
|
||||
if(thread_inwarp_id == 0)
|
||||
{
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuValue_buf(I0) *= type_convert<compType>{}(alpha);
|
||||
|
||||
if(!float_equal_zero{}(beta))
|
||||
{
|
||||
auto threadwise_dst_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<dstDataType,
|
||||
dstDataType,
|
||||
dst1dDescType,
|
||||
decltype(ReducedDataDesc),
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
true>(dst1dDesc,
|
||||
make_multi_index(warp_global_1d_id));
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
|
||||
|
||||
threadwise_dst_load.Run(dst1dDesc,
|
||||
dst_global_val_buf,
|
||||
ReducedDataDesc,
|
||||
make_tuple(I0),
|
||||
priorDstValue_buf);
|
||||
|
||||
accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
|
||||
}
|
||||
|
||||
auto threadwise_dst_val_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<compType,
|
||||
dstDataType,
|
||||
decltype(ReducedDataDesc),
|
||||
dst1dDescType,
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(dst1dDesc,
|
||||
make_multi_index(warp_global_1d_id));
|
||||
|
||||
auto threadwise_dst_idx_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<int,
|
||||
int,
|
||||
decltype(ReducedDataDesc),
|
||||
dst1dDescType,
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(dst1dDesc,
|
||||
make_multi_index(warp_global_1d_id));
|
||||
|
||||
threadwise_dst_val_store.Run(
|
||||
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
|
||||
threadwise_dst_idx_store.Run(
|
||||
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
__device__ static void Run<3>(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ ws_values_global,
|
||||
dstDataType beta,
|
||||
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global)
    {
        (void)origReduceLen;

        auto zeroVal = opReduce::GetZeroVal();

        const auto src_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
                                                            src2dDesc.GetElementSpaceSize(),
                                                            type_convert<srcDataType>{}(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true>
            in_thread_idx_buf;

        using warpwise_reduce = WarpReduceWithIndicesInput<decltype(in_thread_val_buf),
                                                           decltype(in_thread_idx_buf),
                                                           BlockSize,
                                                           opReduce,
                                                           nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
        index_t warp_global_1d_id = thread_global_1d_id / warpSize;
        index_t thread_inwarp_id = thread_global_1d_id % warpSize;

        auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                        compType,
                                                                        src2dDescType,
                                                                        decltype(ThreadBufferDesc),
                                                                        ThreadBufferLengths,
                                                                        Sequence<0, 1>,
                                                                        1,
                                                                        1,
                                                                        1,
                                                                        false>(
            src2dDesc,
            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));

        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
                                                                        int,
                                                                        src2dDescType,
                                                                        decltype(ThreadBufferDesc),
                                                                        ThreadBufferLengths,
                                                                        Sequence<0, 1>,
                                                                        1,
                                                                        1,
                                                                        1,
                                                                        false>(
            src2dDesc,
            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));

        constexpr auto in_thread_copy_step =
            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);

        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += warpSize * GredAccessesPerThreadInWarp)
        {
            threadwise_src_val_load.Run(src2dDesc,
                                        src_global_val_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_val_buf);
            threadwise_src_idx_load.Run(src2dDesc,
                                        src_global_idx_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_idx_buf);

            // do the warp-wise reduction on data of all thread buffers
            warpwise_reduce::Reduce(
                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));

            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the warp stores the reduced result to the global location
        // representing the Warp
        if(thread_inwarp_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>{}(alpha);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(dst1dDesc,
                                                           make_multi_index(warp_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(warp_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(dst1dDesc,
                                                         make_multi_index(warp_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    };
};

} // namespace ck
#endif
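The epilogue of this Run overload is the usual BLAS-style blend: the reduced value is scaled by alpha and, when beta is non-zero, the previous destination value is read back and folded in before the single per-warp store. A minimal host-side C++ sketch of that arithmetic, with plain floats standing in for the descriptor/buffer machinery (the names are illustrative, not the library API):

#include <cstdio>

// Sketch of the alpha/beta epilogue of the final store (hypothetical, scalar form).
// 'reduced' plays the role of accuValue_buf(I0); 'prior_dst' is the value already in
// p_dst_global; the guards mirror float_equal_one{} / float_equal_zero{}.
float apply_alpha_beta(float reduced, float prior_dst, float alpha, float beta)
{
    if(alpha != 1.0f) // !float_equal_one{}(alpha)
        reduced *= alpha;

    if(beta != 0.0f) // !float_equal_zero{}(beta)
        reduced += prior_dst * beta;

    return reduced; // what threadwise_dst_val_store writes back
}

int main()
{
    // e.g. reduced value 7.5, previous output 2.0, alpha = 0.5, beta = 0.25 -> 4.25
    std::printf("%f\n", apply_alpha_beta(7.5f, 2.0f, 0.5f, 0.25f));
}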
@@ -0,0 +1,376 @@
|
||||
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
|
||||
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
|
||||
|
||||
#include "reduction_common.hpp"
|
||||
#include "reduction_operator.hpp"
|
||||
#include "reduction_functions_blockwise.hpp"
|
||||
|
||||
#include "blockwise_tensor_slice_transfer.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <index_t BlockSize,
|
||||
typename srcDataType,
|
||||
typename dstDataType, // not used together with the beta input
|
||||
typename compType,
|
||||
typename src2dDescType,
|
||||
typename dst1dDescType,
|
||||
ReduceTensorOp_t op,
|
||||
NanPropagation_t nanPropaOpt,
|
||||
ReduceTensorIndices_t reduceIndicesOpt,
|
||||
index_t GredAccessesPerThreadInBlock>
|
||||
struct GridwiseReduction_xy_to_x_multiblock
|
||||
{
|
||||
using opReduce = typename reduce_binary_operator<compType, op>::opType;
|
||||
using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
|
||||
using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
|
||||
|
||||
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
|
||||
using blockwise_reduce =
|
||||
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
|
||||
|
||||
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
template <int RunId>
|
||||
__device__ static void Run(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
srcDataType* const __restrict__ ws_values_global,
|
||||
int* const __restrict__ ws_indices_global);
|
||||
|
||||
template <>
|
||||
__device__ static void Run<1>(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
srcDataType* const __restrict__ ws_values_global,
|
||||
int* const __restrict__ ws_indices_global)
|
||||
{
|
||||
(void)ws_indices_global;
|
||||
|
||||
(void)alpha; // unused
|
||||
(void)beta; // unused
|
||||
|
||||
auto zeroVal = opReduce::GetZeroVal();
|
||||
|
||||
// LDS
|
||||
__shared__ compType p_in_block_buffer[BlockBufferSize];
|
||||
|
||||
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
|
||||
auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
|
||||
|
||||
auto in_block_buf =
|
||||
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
|
||||
|
||||
accuValue_buf(I0) = zeroVal;
|
||||
|
||||
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
|
||||
const int divider = origReduceLen;
|
||||
|
||||
const preUnaryOpType preUnaryOp(divider);
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
const index_t blkgroup_id = block_global_id / BlkGroupSize;
|
||||
const index_t block_local_id = block_global_id % BlkGroupSize;
|
||||
|
||||
const index_t reduceSizePerBlock =
|
||||
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
|
||||
BlockBufferSize) *
|
||||
BlockBufferSize;
|
||||
|
||||
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
|
||||
|
||||
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
|
||||
using ThreadClusterLengths = Sequence<1, BlockSize>;
|
||||
|
||||
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
Sequence<1, BlockBufferSize>,
|
||||
ThreadSliceLengths,
|
||||
ThreadClusterLengths,
|
||||
Sequence<0, 1>,
|
||||
srcDataType,
|
||||
compType,
|
||||
src2dDescType,
|
||||
decltype(in_block_desc),
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
false,
|
||||
true>(
|
||||
src2dDesc,
|
||||
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
|
||||
in_block_desc,
|
||||
make_multi_index(0, 0));
|
||||
|
||||
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
|
||||
|
||||
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
|
||||
|
||||
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
|
||||
reducedBlocks += GredAccessesPerThreadInBlock)
|
||||
{
|
||||
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
|
||||
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
|
||||
__syncthreads();
|
||||
|
||||
// do element-wise pre-reduction operation
|
||||
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
|
||||
|
||||
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
|
||||
? GredAccessesPerThreadInBlock
|
||||
: toReduceBlocks - reducedBlocks;
|
||||
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
|
||||
|
||||
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
|
||||
}
|
||||
|
||||
constexpr auto ReducedDataDesc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
|
||||
|
||||
const auto workspace_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
|
||||
|
||||
// The first thread in the block stores the reduced result to the global location
|
||||
// representing the block
|
||||
if(thread_local_id == 0)
|
||||
{
|
||||
auto threadwise_workspace_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<compType,
|
||||
srcDataType,
|
||||
decltype(ReducedDataDesc),
|
||||
decltype(workspace_desc),
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(workspace_desc,
|
||||
make_multi_index(block_global_id));
|
||||
|
||||
threadwise_workspace_store.Run(ReducedDataDesc,
|
||||
make_tuple(I0),
|
||||
accuValue_buf,
|
||||
workspace_desc,
|
||||
workspace_global_buf);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
__device__ static void Run<2>(const src2dDescType& src2dDesc,
|
||||
const dst1dDescType& dst1dDesc,
|
||||
int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
srcDataType alpha,
|
||||
const srcDataType* const __restrict__ p_src_global,
|
||||
dstDataType beta,
|
||||
srcDataType* const __restrict__ ws_values_global,
|
||||
int* const __restrict__ ws_indices_global)
|
||||
{
|
||||
(void)alpha; // unused
|
||||
(void)beta; // unused
|
||||
|
||||
auto zeroVal = opReduce::GetZeroVal();
|
||||
|
||||
// LDS
|
||||
__shared__ compType p_in_block_values_buffer[BlockBufferSize];
|
||||
__shared__ int p_in_block_indices_buffer[BlockBufferSize];
|
||||
|
||||
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
|
||||
auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
|
||||
auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||
ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
|
||||
|
||||
auto in_block_val_buf =
|
||||
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
|
||||
auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
|
||||
p_in_block_indices_buffer, BlockBufferSize);
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
|
||||
|
||||
accuValue_buf(I0) = zeroVal;
|
||||
accuIndex_buf(I0) = 0;
|
||||
|
||||
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
|
||||
const int divider = origReduceLen;
|
||||
|
||||
const preUnaryOpType preUnaryOp(divider);
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
const index_t blkgroup_id = block_global_id / BlkGroupSize;
|
||||
const index_t block_local_id = block_global_id % BlkGroupSize;
|
||||
|
||||
const index_t reduceSizePerBlock =
|
||||
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
|
||||
BlockBufferSize) *
|
||||
BlockBufferSize;
|
||||
|
||||
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
|
||||
|
||||
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
|
||||
using ThreadClusterLengths = Sequence<1, BlockSize>;
|
||||
|
||||
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
Sequence<1, BlockBufferSize>,
|
||||
ThreadSliceLengths,
|
||||
ThreadClusterLengths,
|
||||
Sequence<0, 1>,
|
||||
srcDataType,
|
||||
compType,
|
||||
src2dDescType,
|
||||
decltype(in_block_desc),
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
false,
|
||||
true>(
|
||||
src2dDesc,
|
||||
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
|
||||
in_block_desc,
|
||||
make_multi_index(0, 0));
|
||||
|
||||
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
|
||||
|
||||
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
|
||||
|
||||
int indexOffset = block_local_id * reduceSizePerBlock;
|
||||
|
||||
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
|
||||
reducedBlocks += GredAccessesPerThreadInBlock)
|
||||
{
|
||||
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
|
||||
|
||||
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
|
||||
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
|
||||
// done here
|
||||
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
|
||||
|
||||
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
|
||||
? GredAccessesPerThreadInBlock
|
||||
: toReduceBlocks - reducedBlocks;
|
||||
|
||||
blockwise_reduce::Reduce2(in_block_val_buf,
|
||||
in_block_idx_buf,
|
||||
BlocksInOneOp,
|
||||
accuValue_buf(I0),
|
||||
accuIndex_buf(I0));
|
||||
|
||||
indexOffset += BlockBufferSize;
|
||||
|
||||
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
|
||||
}
|
||||
|
||||
constexpr auto ReducedDataDesc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
|
||||
|
||||
const auto workspace_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
|
||||
|
||||
// The first thread in the block stores the reduced result to the global location
|
||||
// representing the block
|
||||
if(thread_local_id == 0)
|
||||
{
|
||||
auto threadwise_workspace_val_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<compType,
|
||||
srcDataType,
|
||||
decltype(ReducedDataDesc),
|
||||
decltype(workspace_desc),
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(workspace_desc,
|
||||
make_multi_index(block_global_id));
|
||||
|
||||
auto threadwise_workspace_idx_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<int,
|
||||
int,
|
||||
decltype(ReducedDataDesc),
|
||||
decltype(workspace_desc),
|
||||
Sequence<1>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
1,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
1,
|
||||
true>(workspace_desc,
|
||||
make_multi_index(block_global_id));
|
||||
|
||||
threadwise_workspace_val_store.Run(ReducedDataDesc,
|
||||
make_tuple(I0),
|
||||
accuValue_buf,
|
||||
workspace_desc,
|
||||
workspace_global_val_buf);
|
||||
threadwise_workspace_idx_store.Run(ReducedDataDesc,
|
||||
make_tuple(I0),
|
||||
accuIndex_buf,
|
||||
workspace_desc,
|
||||
workspace_global_idx_buf);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
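In the multi-block kernel above, each output element is reduced by BlkGroupSize cooperating blocks: the reduce length is first split evenly across the group, then padded up to a multiple of the LDS tile (BlockBufferSize = BlockSize * GredAccessesPerThreadInBlock), and each block writes one partial result into a workspace of dst_length * BlkGroupSize entries. A small host-side sketch of that partitioning arithmetic (simplified; the struct and function names are made up for illustration):

#include <cassert>
#include <cstdio>

// Sketch of the multi-block partitioning arithmetic (host side, simplified).
struct MultiBlockSplit
{
    int reduceSizePerBlock; // padded share of the reduce dimension per block
    int toReduceBlocks;     // how many BlockSize-wide tiles a block walks through
    int workspaceSize;      // partial results, one per block
};

MultiBlockSplit split(int toReduceLength, int dstLength, int BlkGroupSize,
                      int BlockSize, int GredAccessesPerThreadInBlock)
{
    const int BlockBufferSize = BlockSize * GredAccessesPerThreadInBlock;

    // same rounding as the kernel: ceil over BlkGroupSize, then up to a BlockBufferSize multiple
    const int perBlock = (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize +
                           BlockBufferSize - 1) / BlockBufferSize) * BlockBufferSize;

    return {perBlock, (perBlock + BlockSize - 1) / BlockSize, dstLength * BlkGroupSize};
}

int main()
{
    // e.g. reduce 10000 elements per output, 64 outputs, 4 blocks per output,
    // 256 threads, 2 accesses per thread -> each block covers 2560 padded elements
    const auto s = split(10000, 64, 4, 256, 2);
    assert(s.reduceSizePerBlock % (256 * 2) == 0);
    std::printf("%d %d %d\n", s.reduceSizePerBlock, s.toReduceBlocks, s.workspaceSize);
}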
@@ -0,0 +1,271 @@
|
||||
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
|
||||
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
|
||||
|
||||
#include "data_type.hpp"
|
||||
|
||||
#include "reduction_common.hpp"
|
||||
#include "reduction_operator.hpp"
|
||||
#include "reduction_functions_binop.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename buffer2dDescType,
|
||||
bool blockIsOneRow,
|
||||
typename opReduce,
|
||||
NanPropagation_t nanPropaOpt>
|
||||
struct BlockwiseReduction_2d_block_buffer
|
||||
{
|
||||
using compType = typename opReduce::dataType;
|
||||
|
||||
static constexpr auto buffer2dDesc = buffer2dDescType{};
|
||||
|
||||
static constexpr index_t BlockSize =
|
||||
blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
|
||||
static constexpr index_t NumBlocks =
|
||||
blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
|
||||
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
|
||||
|
||||
// This interface does not accumulate on indices
|
||||
template <typename BufferType>
|
||||
__device__ static void
|
||||
Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
|
||||
{
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
|
||||
index_t offset;
|
||||
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
|
||||
{
|
||||
offset = blockIsOneRow
|
||||
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
|
||||
compType opData = type_convert<compType>{}(block_buffer[offset]);
|
||||
|
||||
binop::calculate(lAccuData, opData);
|
||||
}
|
||||
|
||||
offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
|
||||
|
||||
block_buffer(offset) = lAccuData;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
|
||||
{
|
||||
if(thread_local_id < indOffset)
|
||||
{
|
||||
index_t offset1 =
|
||||
blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
|
||||
|
||||
index_t offset2 =
|
||||
blockIsOneRow
|
||||
? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
|
||||
|
||||
compType opData1 = type_convert<compType>{}(block_buffer[offset1]);
|
||||
compType opData2 = type_convert<compType>{}(block_buffer[offset2]);
|
||||
binop::calculate(opData1, opData2);
|
||||
block_buffer(offset1) = type_convert<compType>{}(opData1);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(thread_local_id == 0)
|
||||
{
|
||||
compType tmpVal = type_convert<compType>{}(block_buffer[0]);
|
||||
|
||||
binop::calculate(accuData, tmpVal);
|
||||
}
|
||||
};
|
||||
|
||||
// This interface accumulates on both data values and indices
|
||||
template <typename BufferType, typename IdxBufferType>
|
||||
__device__ static void Reduce2(BufferType& block_buffer,
|
||||
IdxBufferType& block_indices_buffer,
|
||||
index_t toReduceBlocks,
|
||||
compType& accuData,
|
||||
int& accuIndex)
|
||||
{
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
int lAccuIndex = 0;
|
||||
|
||||
if constexpr(blockIsOneRow)
|
||||
{
|
||||
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
|
||||
{
|
||||
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
|
||||
{
|
||||
if(thread_local_id % (indOffset * 2) == 0)
|
||||
{
|
||||
index_t offset1 =
|
||||
buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
|
||||
index_t offset2 = buffer2dDesc.CalculateOffset(
|
||||
make_tuple(otherDimInd, thread_local_id + indOffset));
|
||||
|
||||
compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
|
||||
compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
|
||||
int currIndex1 = block_indices_buffer[offset1];
|
||||
int currIndex2 = block_indices_buffer[offset2];
|
||||
|
||||
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
|
||||
block_buffer(offset1) = type_convert<compType>{}(currVal1);
|
||||
block_indices_buffer(offset1) = currIndex1;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
if(thread_local_id == 0)
|
||||
{
|
||||
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
|
||||
{
|
||||
index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));
|
||||
|
||||
compType tmpVal = type_convert<compType>{}(block_buffer[offset]);
|
||||
int tmpIndex = block_indices_buffer[offset];
|
||||
|
||||
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
|
||||
}
|
||||
|
||||
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
index_t offset;
|
||||
|
||||
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
|
||||
{
|
||||
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
|
||||
compType currVal = type_convert<compType>{}(block_buffer[offset]);
|
||||
int currIndex = block_indices_buffer[offset];
|
||||
|
||||
binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
|
||||
}
|
||||
|
||||
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
|
||||
|
||||
block_buffer(offset) = lAccuData;
|
||||
block_indices_buffer(offset) = lAccuIndex;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
|
||||
{
|
||||
if(thread_local_id % (indOffset * 2) == 0)
|
||||
{
|
||||
index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
|
||||
index_t offset2 =
|
||||
buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
|
||||
|
||||
compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
|
||||
compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
|
||||
int currIndex1 = block_indices_buffer[offset1];
|
||||
int currIndex2 = block_indices_buffer[offset2];
|
||||
|
||||
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
|
||||
block_buffer(offset1) = type_convert<compType>{}(currVal1);
|
||||
block_indices_buffer(offset1) = currIndex1;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(thread_local_id == 0)
|
||||
{
|
||||
compType tmpVal = type_convert<compType>{}(block_buffer[0]);
|
||||
int tmpIndex = block_indices_buffer[0];
|
||||
|
||||
binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename BufferType>
|
||||
__device__ static void set_buffer_value(BufferType& block_buffer, compType value)
|
||||
{
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
|
||||
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
|
||||
{
|
||||
index_t offset = blockIsOneRow
|
||||
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
|
||||
|
||||
block_buffer(offset) = value;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
};
|
||||
|
||||
    // Initialize the block-wise indices buffer; the index of each element in the block-wise
    // data buffer is calculated from its position in the buffer and the global starting index
|
||||
template <typename IdxBufferType>
|
||||
__device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
|
||||
{
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
|
||||
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
|
||||
{
|
||||
index_t offset = blockIsOneRow
|
||||
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
|
||||
|
||||
block_indices_buffer(offset) = offset + indexStart;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
};
|
||||
|
||||
// Execute unary operation on the block buffer elements
|
||||
template <typename unary_op_type, typename BufferType>
|
||||
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
|
||||
{
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
|
||||
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
|
||||
{
|
||||
index_t offset = blockIsOneRow
|
||||
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
|
||||
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
|
||||
|
||||
block_buffer(offset) = unary_op(block_buffer[offset]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
}; // end of namespace ck
|
||||
|
||||
#endif
|
||||
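BlockwiseReduction_2d_block_buffer::Reduce works in two phases: every thread first folds its own column of the LDS tile serially, then the BlockSize partial results are combined by a halving tree with a barrier after each step, and thread 0 folds the final value into its accumulator. A sequential host-side sketch of the same pattern for a max-reduction (the nested vector stands in for the LDS buffer; this is an illustration, not the device code):

#include <algorithm>
#include <cstdio>
#include <vector>

// Host-side model of the block-wise reduce: serial per-"thread" pass, then a halving tree.
// The __syncthreads() barriers of the real kernel are implicit in this sequential model.
float blockwise_max(const std::vector<std::vector<float>>& buf /* [NumBlocks][BlockSize] */)
{
    const size_t BlockSize = buf.front().size();
    std::vector<float> partial(BlockSize, -1e30f); // stands in for opReduce::GetZeroVal()

    for(size_t tid = 0; tid < BlockSize; ++tid)    // per-thread serial accumulation
        for(const auto& row : buf)
            partial[tid] = std::max(partial[tid], row[tid]);

    for(size_t stride = BlockSize / 2; stride > 0; stride /= 2) // tree combine
        for(size_t tid = 0; tid < stride; ++tid)
            partial[tid] = std::max(partial[tid], partial[tid + stride]);

    return partial[0]; // what thread 0 folds into accuData
}

int main()
{
    std::vector<std::vector<float>> buf = {{1, 5, 3, 2}, {9, 0, 4, 8}};
    std::printf("%f\n", blockwise_max(buf)); // prints 9
}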
@@ -0,0 +1,141 @@
|
||||
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP

#include "data_type.hpp"

#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"

namespace ck {

template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs to use a StaticBuffer!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
    };

    // This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method at the first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = I + indexStart;
            binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};

template <typename BufferType,
          typename IdxBufferType,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs to use a StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs to use a StaticBuffer for indices!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method at the second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};

} // namespace ck

#endif
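ThreadReduce::Reduce2 walks a small per-thread buffer and keeps both the best value and its flattened global index, which is the element's position in the buffer plus the thread's starting offset. A host-side sketch for a max-reduction with index tracking (a fixed-size array stands in for the StaticBuffer; names are illustrative only):

#include <cstdio>

// Host-side model of Reduce2 for a max operator: fold every buffer element into
// (accuData, accuIndex), recording the flattened global index on every improvement.
void reduce2_max(const float (&thread_buffer)[4], float& accuData, int& accuIndex, int indexStart)
{
    for(int i = 0; i < 4; ++i)
    {
        const int currIndex = i + indexStart; // flattened global index of this element
        if(thread_buffer[i] > accuData)       // the "changed" path of binop::calculate
        {
            accuData  = thread_buffer[i];
            accuIndex = currIndex;
        }
    }
}

int main()
{
    float buf[4] = {0.5f, 3.0f, -1.0f, 2.0f};
    float accu   = -1e30f;
    int   index  = 0;
    reduce2_max(buf, accu, index, /*indexStart=*/128);
    std::printf("%f at %d\n", accu, index); // 3.0 at 129
}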
@@ -0,0 +1,371 @@
|
||||
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
|
||||
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
|
||||
|
||||
#include "data_type.hpp"
|
||||
|
||||
#include "reduction_common.hpp"
|
||||
#include "reduction_operator.hpp"
|
||||
#include "reduction_functions_binop.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
|
||||
struct WarpReduce
|
||||
{
|
||||
using compType = typename opReduce::dataType;
|
||||
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
|
||||
|
||||
static_assert(BufferType::IsStaticBuffer(),
|
||||
"Per-thread buffer for WarpWise reduction should be StaticBuffer!");
|
||||
static_assert(std::is_same<typename BufferType::type, compType>::value,
|
||||
"Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
|
||||
"the compType!");
|
||||
|
||||
static constexpr index_t ThreadBufferLen = BufferType::Size();
|
||||
static constexpr bool have_builtin_shuffle =
|
||||
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
|
||||
|
||||
// This interface does not accumulate on indices
|
||||
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
|
||||
{
|
||||
if constexpr(have_builtin_shuffle)
|
||||
ReduceImpl1(thread_buffer, accuData);
|
||||
else
|
||||
ReduceImpl2(thread_buffer, accuData);
|
||||
};
|
||||
|
||||
// This interface implementation uses HIP built-in device shuffling functions
|
||||
__device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}(
|
||||
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
|
||||
|
||||
// synchronize among all threads in this warp
|
||||
__all(1);
|
||||
|
||||
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
|
||||
{
|
||||
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
|
||||
binop::calculate(lAccuData, tmpVal);
|
||||
__all(1);
|
||||
}
|
||||
|
||||
binop::calculate(accuData, lAccuData);
|
||||
};
|
||||
|
||||
    // This implementation does not use the HIP built-in device shuffle functions,
    // since HIP does not provide built-in shuffles for fp16
|
||||
__device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}(
|
||||
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
|
||||
|
||||
__syncthreads();
|
||||
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
index_t warpId = thread_id / warpSize;
|
||||
index_t thread_inwarp_id = thread_id % warpSize;
|
||||
|
||||
__shared__ compType shuffle_buffer[BlockSize];
|
||||
|
||||
compType* myBuffer = &shuffle_buffer[warpId * warpSize];
|
||||
|
||||
myBuffer[thread_inwarp_id] = lAccuData;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
|
||||
{
|
||||
if(thread_inwarp_id < stride)
|
||||
{
|
||||
compType currVal1 = myBuffer[thread_inwarp_id];
|
||||
compType currVal2 = myBuffer[thread_inwarp_id + stride];
|
||||
|
||||
binop::calculate(currVal1, currVal2);
|
||||
|
||||
myBuffer[thread_inwarp_id] = currVal1;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if(thread_inwarp_id == 0)
|
||||
binop::calculate(accuData, myBuffer[0]);
|
||||
};
|
||||
|
||||
// This interface accumulates on both data values and indices and is called by Direct_WarpWise
|
||||
// reduction method at first-time reduction
|
||||
__device__ static void
|
||||
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
|
||||
{
|
||||
if constexpr(have_builtin_shuffle)
|
||||
Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
|
||||
else
|
||||
Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
|
||||
};
|
||||
|
||||
// This interface implementation uses HIP built-in device shuffling functions
|
||||
__device__ static void Reduce2Impl1(const BufferType& thread_buffer,
|
||||
compType& accuData,
|
||||
int& accuIndex,
|
||||
int indexStart)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
int lAccuIndex = 0;
|
||||
index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
|
||||
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
|
||||
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
|
||||
});
|
||||
|
||||
// synchronize among all threads in this warp
|
||||
__all(1);
|
||||
|
||||
for(index_t stride = 1; stride < warpSize; stride *= 2)
|
||||
{
|
||||
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
|
||||
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
|
||||
|
||||
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
|
||||
__all(1);
|
||||
}
|
||||
|
||||
if(thread_inwarp_id == 0)
|
||||
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
|
||||
};
|
||||
|
||||
    // This implementation does not use the HIP built-in device shuffle functions,
    // since HIP does not provide built-in shuffles for fp16
|
||||
__device__ static void Reduce2Impl2(const BufferType& thread_buffer,
|
||||
compType& accuData,
|
||||
int& accuIndex,
|
||||
int indexStart)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
int lAccuIndex = 0;
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
index_t warpId = thread_id / warpSize;
|
||||
index_t thread_inwarp_id = thread_id % warpSize;
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
|
||||
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
|
||||
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
|
||||
});
|
||||
|
||||
__shared__ compType shuffle_data_buffer[BlockSize];
|
||||
__shared__ int shuffle_indices_buffer[BlockSize];
|
||||
|
||||
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
|
||||
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
|
||||
|
||||
myDataBuffer[thread_inwarp_id] = lAccuData;
|
||||
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t stride = 1; stride < warpSize; stride *= 2)
|
||||
{
|
||||
compType currVal1 = myDataBuffer[thread_inwarp_id];
|
||||
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
|
||||
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
|
||||
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
|
||||
|
||||
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
|
||||
|
||||
myDataBuffer[thread_inwarp_id] = currVal1;
|
||||
myIndicesBuffer[thread_inwarp_id] = currIndex1;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(thread_inwarp_id == 0)
|
||||
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
|
||||
};
|
||||
|
||||
// cppcheck-suppress constParameter
|
||||
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
|
||||
{
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
|
||||
|
||||
__all(1);
|
||||
};
|
||||
|
||||
// Execute unary operation on the per-thread buffer elements
|
||||
template <typename unary_op_type>
|
||||
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
|
||||
{
|
||||
static_for<0, ThreadBufferLen, 1>{}(
|
||||
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
|
||||
|
||||
__all(1);
|
||||
};
|
||||
};
|
||||
|
||||
template <typename BufferType,
|
||||
typename IdxBufferType,
|
||||
index_t BlockSize,
|
||||
typename opReduce,
|
||||
NanPropagation_t nanPropaOpt>
|
||||
struct WarpReduceWithIndicesInput
|
||||
{
|
||||
using compType = typename opReduce::dataType;
|
||||
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
|
||||
|
||||
static_assert(BufferType::IsStaticBuffer(),
|
||||
"Per-thread buffer for WarpWise reduction should be StaticBuffer!");
|
||||
static_assert(IdxBufferType::IsStaticBuffer(),
|
||||
"Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");
|
||||
|
||||
static_assert(std::is_same<typename BufferType::type, compType>::value,
|
||||
"Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
|
||||
"the compType!");
|
||||
static_assert(
|
||||
std::is_same<typename IdxBufferType::type, index_t>::value,
|
||||
"Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");
|
||||
|
||||
static_assert(BufferType::Size() == IdxBufferType::Size(),
|
||||
"StaticBuffers for data and indices should have the same sizes!");
|
||||
|
||||
static constexpr index_t ThreadBufferLen = BufferType::Size();
|
||||
static constexpr bool have_builtin_shuffle =
|
||||
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
|
||||
|
||||
// This interface accumulates on both data values and indices and is called by Direct_WarpWise
|
||||
// reduction method at second-time reduction
|
||||
__device__ static void Reduce(const BufferType& thread_buffer,
|
||||
const IdxBufferType& thread_indices_buffer,
|
||||
compType& accuData,
|
||||
int& accuIndex)
|
||||
{
|
||||
if constexpr(have_builtin_shuffle)
|
||||
ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
|
||||
else
|
||||
ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
|
||||
};
|
||||
|
||||
// This interface implementation uses HIP built-in device shuffling functions
|
||||
__device__ static void ReduceImpl1(const BufferType& thread_buffer,
|
||||
const IdxBufferType& thread_indices_buffer,
|
||||
compType& accuData,
|
||||
int& accuIndex)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
int lAccuIndex = 0;
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
|
||||
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
|
||||
});
|
||||
|
||||
// synchronize among all threads in this warp
|
||||
__all(1);
|
||||
|
||||
for(index_t stride = 1; stride < warpSize; stride *= 2)
|
||||
{
|
||||
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
|
||||
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
|
||||
|
||||
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
|
||||
__all(1);
|
||||
}
|
||||
|
||||
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
|
||||
};
|
||||
|
||||
    // This implementation does not use the HIP built-in device shuffle functions,
    // since HIP does not provide built-in shuffles for fp16
|
||||
__device__ static void ReduceImpl2(const BufferType& thread_buffer,
|
||||
const IdxBufferType& thread_indices_buffer,
|
||||
compType& accuData,
|
||||
int& accuIndex)
|
||||
{
|
||||
compType lAccuData = opReduce::GetZeroVal();
|
||||
int lAccuIndex = 0;
|
||||
index_t thread_id = get_thread_local_1d_id();
|
||||
index_t warpId = thread_id / warpSize;
|
||||
index_t thread_inwarp_id = thread_id % warpSize;
|
||||
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
|
||||
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
|
||||
});
|
||||
|
||||
__shared__ compType shuffle_data_buffer[BlockSize];
|
||||
__shared__ int shuffle_indices_buffer[BlockSize];
|
||||
|
||||
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
|
||||
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
|
||||
|
||||
myDataBuffer[thread_inwarp_id] = lAccuData;
|
||||
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t stride = 1; stride < warpSize; stride *= 2)
|
||||
{
|
||||
compType currVal1 = myDataBuffer[thread_inwarp_id];
|
||||
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
|
||||
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
|
||||
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
|
||||
|
||||
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
|
||||
|
||||
myDataBuffer[thread_inwarp_id] = currVal1;
|
||||
myIndicesBuffer[thread_inwarp_id] = currIndex1;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(thread_inwarp_id == 0)
|
||||
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
|
||||
};
|
||||
|
||||
// cppcheck-suppress constParameter
|
||||
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
|
||||
{
|
||||
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
|
||||
|
||||
__all(1);
|
||||
};
|
||||
|
||||
// Execute unary operation on the per-thread buffer elements
|
||||
template <typename unary_op_type>
|
||||
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
|
||||
{
|
||||
static_for<0, ThreadBufferLen, 1>{}(
|
||||
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
|
||||
|
||||
__all(1);
|
||||
};
|
||||
};
|
||||
|
||||
}; // end of namespace ck
|
||||
|
||||
#endif
|
||||
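The shuffle-based path (ReduceImpl1) relies on __shfl_down to pull the partial result from the lane `stride` positions higher, halving `stride` each round until lane 0 holds the warp's result. A minimal standalone HIP-style kernel sketch of that idiom for a plain sum (an illustration under stated assumptions, not the library kernel; the host launch is omitted for brevity):

#include <hip/hip_runtime.h>
#include <cstdio>

// Each lane starts from one input element, then the shuffle loop folds the warp together.
// warpSize is the HIP built-in; lane 0 of each warp writes the warp's sum.
__global__ void warp_sum(const float* in, float* out)
{
    const int lane  = threadIdx.x % warpSize;
    float lAccuData = in[threadIdx.x]; // per-lane partial (one element here for brevity)

    for(int stride = warpSize / 2; stride > 0; stride /= 2)
        lAccuData += __shfl_down(lAccuData, stride, warpSize);

    if(lane == 0)
        out[threadIdx.x / warpSize] = lAccuData;
}

int main()
{
    // compile with hipcc; launching one block of warpSize threads over warpSize floats
    // would exercise warp_sum above
    std::printf("see warp_sum kernel sketch\n");
}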
@@ -713,9 +713,6 @@ struct ThreadwiseTensorSliceTransfer_v3
        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        // TODO: fix this
        static_assert(is_same<SrcData, DstData>::value,
                      "wrong! current implementation assume SrcData and DstData are same type");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -985,7 +982,8 @@ struct ThreadwiseTensorSliceTransfer_v3
                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

                dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
                dst_tmp_vector.template AsType<DstData>()(i) =
                    type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
            });

            using dst_vector_t = typename decltype(dst_tmp_vector)::type;

@@ -38,6 +38,10 @@ struct DynamicBuffer
        return BufferAddressSpace;
    }

    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }

    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }

    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
composable_kernel/include/utility/reduction_common.hpp (new file, 104 lines)
@@ -0,0 +1,104 @@
|
||||
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP

// these enumerations should be synchronized with include/miopen/reduce_common.hpp
namespace ck {
enum class ReductionMethod_t
{
    DirectThreadWise = 1,
    DirectWarpWise = 2,
    BlockWise = 3,
    MultiBlock = 4
};

enum class ReduceTensorOp_t
{
    ADD = 0,
    MUL = 1,
    MIN = 2,
    MAX = 3,
    AMAX = 4,
    AVG = 5,
    NORM1 = 6,
    NORM2 = 7,
    // MUL_NO_ZEROS = 8,
};

enum class NanPropagation_t
{
    NOT_PROPAGATE_NAN = 0,
    PROPAGATE_NAN = 1,
};

enum class ReduceTensorIndices_t
{
    NO_INDICES = 0,
    FLATTENED_INDICES = 1,
};

enum class IndicesType_t
{
    INDICES_32BIT = 0,
    INDICES_64BIT = 1,
    INDICES_16BIT = 2,
    INDICES_8BIT = 3,
};

struct float_equal_one
{
    template <class T>
    __device__ static inline bool apply(T x)
    {
        return x <= type_convert<T>{}(1.0f) and x >= type_convert<T>{}(1.0f);
    }

    template <class T>
    __device__ inline bool operator()(T x)
    {
        return (float_equal_one::apply(x));
    };
};

struct float_equal_zero
{
    template <class T>
    __device__ static inline bool apply(T x)
    {
        return x <= type_convert<T>{}(0.0f) and x >= type_convert<T>{}(0.0f);
    }

    template <class T>
    __device__ inline bool operator()(T x)
    {
        return (float_equal_zero::apply(x));
    };
};

} // namespace ck

#endif
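float_equal_one and float_equal_zero express an exact compare through a pair of inequalities (x <= c and x >= c), which only needs ordering operators and sidesteps a direct floating-point == comparison. A tiny host-side sketch of how the gridwise kernels use this to skip the beta read-back (the helper name is hypothetical):

#include <cstdio>

// Host-side sketch of the float_equal_zero idiom: exact comparison via <= and >=.
template <class T>
bool equals(T x, T c) { return x <= c && x >= c; }

int main()
{
    float beta = 0.0f;
    if(!equals(beta, 0.0f)) // mirrors: if(!float_equal_zero{}(beta)) { read prior dst ... }
        std::printf("blend with prior output\n");
    else
        std::printf("skip the read-back of the destination\n");
}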
composable_kernel/include/utility/reduction_functions_binop.hpp (new file, 100 lines)
@@ -0,0 +1,100 @@
|
||||
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP

#include "data_type.hpp"

#include "reduction_common.hpp"
#include "reduction_operator.hpp"

namespace ck {
namespace detail {

static inline __device__ bool isnan(half_t x) { return __hisnan(x); };

template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
    // cppcheck-suppress constParameter
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    // cppcheck-suppress constParameter
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        bool changed = false;

        opReduce{}(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    };
};

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed = false;

            opReduce{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        }
    };
};

}; // namespace detail
}; // end of namespace ck

#endif
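The PROPAGATE_NAN specialization above makes a NaN input win immediately, overwriting both the accumulated value and the recorded index; otherwise the underlying operator's compare-and-update runs and the index follows the "changed" flag. A host-side sketch of that behaviour for an indexed max (simplified; not the library functor):

#include <cmath>
#include <cstdio>

// Host-side model of the PROPAGATE_NAN, indexed calculate(): NaN always replaces the
// accumulator; otherwise a normal max compare-and-update with index tracking runs.
void calculate_max_nan(float& accuVal, float currVal, int& accuIndex, int currIndex)
{
    if(std::isnan(currVal))
    {
        accuVal   = currVal;
        accuIndex = currIndex;
    }
    else if(currVal > accuVal) // the "changed" path of the underlying Max operator
    {
        accuVal   = currVal;
        accuIndex = currIndex;
    }
}

int main()
{
    float accu = 1.0f;
    int   idx  = 0;
    calculate_max_nan(accu, 5.0f, idx, 3);          // normal update -> 5.0 at 3
    calculate_max_nan(accu, std::nanf(""), idx, 7); // NaN wins and is propagated
    std::printf("%f at %d\n", accu, idx);
}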
composable_kernel/include/utility/reduction_operator.hpp (new file, 420 lines)
@@ -0,0 +1,420 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#ifndef CK_REDUCTION_OPERATOR_HPP
|
||||
#define CK_REDUCTION_OPERATOR_HPP
|
||||
|
||||
#include "reduction_common.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
namespace reduce {
|
||||
|
||||
// Every binary operator used in reduction is represented by a templated functor class. Each functor
|
||||
// class must provide at least
|
||||
// three members:
|
||||
// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator,
|
||||
// "identity element" is the unique
|
||||
// element in the algebraic space that doesn't affect the value of other elements
|
||||
// when operated with any of them.
|
||||
// 2) indexable -- boolean value indicating whether indices of the operated elements could be
|
||||
// recorded. Usually, the Min/Max operators may
|
||||
// need to record the indices of elements. For operators like Add/Mul, there is no need to
|
||||
// record the indices.
|
||||
// 3) operator() -- the first argument of the operator must be both an input & output, and the
|
||||
// corresponding variable usually stores
|
||||
// the accumulated result of many operator() calls; the second argument is only an
|
||||
// input. For an indexable binary
|
||||
// operator, the second version of operator() has a third argument (which is an
|
||||
// output) to indicate whether the
|
||||
// accumulated value (the first argument) has changed, in which case the recorded
|
||||
// accumulated index also needs to be
|
||||
// changed.
|
||||
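// Illustrative sketch (not part of the original source): a plain accumulation loop that
// drives a functor satisfying the contract above when indices are requested, mirroring
// what binop_with_nan_check::calculate() does per element. The function name
// "example_indexed_accumulate" is hypothetical.
template <typename opReduce, typename compType>
__device__ void
example_indexed_accumulate(const compType* in, int len, compType& accuVal, int& accuIndex)
{
    accuVal   = opReduce::GetZeroVal(); // start from the operator's identity element
    accuIndex = 0;

    for(int i = 0; i < len; ++i)
    {
        bool changed = false;
        opReduce{}(accuVal, in[i], changed); // first argument is both input and output
        if(changed)
            accuIndex = i; // remember where the accumulated value came from
    }
}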
|
||||
template <class T>
|
||||
struct Add
|
||||
{
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return type_convert<T>{}(0.0f); };
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct Mul
|
||||
{
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return type_convert<T>{}(1.0f); };
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct Max
|
||||
{
|
||||
using dataType = T;
|
||||
|
||||
// lowest() rather than min() so that negative values are handled for floating-point T
__device__ static T GetZeroVal() { return std::numeric_limits<T>::lowest(); };
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b) const
|
||||
{
|
||||
if(a < b)
|
||||
a = b;
|
||||
}
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b, bool& changed) const
|
||||
{
|
||||
if(a < b)
|
||||
{
|
||||
a = b;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr bool indexable = true;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct Min
|
||||
{
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return std::numeric_limits<T>::max(); };
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b) const
|
||||
{
|
||||
if(a > b)
|
||||
a = b;
|
||||
}
|
||||
|
||||
__device__ inline constexpr void operator()(T& a, T b, bool& changed) const
|
||||
{
|
||||
if(a > b)
|
||||
{
|
||||
a = b;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr bool indexable = true;
|
||||
};
|
||||
|
||||
template <>
|
||||
__device__ half_t Max<half_t>::GetZeroVal()
|
||||
{
|
||||
return type_convert<half_t>{}(std::numeric_limits<float>::lowest());
|
||||
};
|
||||
|
||||
template <>
|
||||
__device__ half_t Min<half_t>::GetZeroVal()
|
||||
{
|
||||
return type_convert<half_t>{}(std::numeric_limits<float>::max());
|
||||
};
|
||||
|
||||
// Unary operators are usually applied element-wise before the reduction is executed on the
|
||||
// elements.
|
||||
// They are needed for easy implementation of the AVG, NORM1 and NORM2 reduction types
// (see the usage sketches further below)
|
||||
template <class T, bool hasDividing>
|
||||
struct unary_identic
|
||||
{
|
||||
__device__ unary_identic(const int divider = 1)
|
||||
{
|
||||
scaler = 1.0f / static_cast<float>(divider);
|
||||
};
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };
|
||||
|
||||
float scaler = 1.0f;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct unary_identic<T, false>
|
||||
{
|
||||
__device__ unary_identic(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const { return a; };
|
||||
};
|
||||
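// Illustrative sketch (not part of the original source): AVG is assembled from the pieces
// above -- an Add accumulation followed by a dividing unary_identic post-op. The function
// name "example_avg_of_four" and the fixed length 4 are hypothetical.
__device__ inline float example_avg_of_four(const float (&v)[4])
{
    float acc = Add<float>::GetZeroVal();
    for(int i = 0; i < 4; ++i)
        Add<float>{}(acc, v[i]); // accumulate the sum

    unary_identic<float, true> divide_by_n(4); // post-op scales by 1/4
    return divide_by_n(acc);
}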
|
||||
template <class T, bool hasDividing>
|
||||
struct unary_square
|
||||
{
|
||||
__device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const
|
||||
{
|
||||
a = a * a;
|
||||
|
||||
return a * type_convert<T>{}(scaler);
|
||||
};
|
||||
|
||||
float scaler = 1.0f;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct unary_square<T, false>
|
||||
{
|
||||
__device__ unary_square(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const { return a * a; };
|
||||
};
|
||||
|
||||
template <class T, bool hasDividing>
|
||||
struct unary_abs
|
||||
{
|
||||
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const
|
||||
{
|
||||
a = abs(a);
|
||||
|
||||
return a * type_convert<T>{}(scaler);
|
||||
};
|
||||
|
||||
float scaler = 1.0f;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct unary_abs<T, false>
|
||||
{
|
||||
__device__ unary_abs(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline constexpr T operator()(T a) const { return abs(a); };
|
||||
};
|
||||
|
||||
// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
|
||||
// Let's assume that __habs() exists since 3.5.
|
||||
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
|
||||
inline __device__ __half __habs(__half x)
|
||||
{
|
||||
union
|
||||
{
|
||||
__half half;
|
||||
unsigned short u16;
|
||||
} val;
|
||||
val.half = x;
|
||||
val.u16 = val.u16 & 0x7fff;
|
||||
return val.half;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <bool hasDividing>
|
||||
struct unary_abs<half_t, hasDividing>
|
||||
{
|
||||
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
|
||||
|
||||
__device__ inline half_t operator()(half_t a) const
|
||||
{
|
||||
a = static_cast<half_t>(__habs(a));
|
||||
|
||||
return a * type_convert<half_t>{}(scaler);
|
||||
};
|
||||
|
||||
float scaler = 1.0f;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct unary_abs<half_t, false>
|
||||
{
|
||||
__device__ unary_abs(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct unary_sqrt
|
||||
{
|
||||
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline T operator()(T a) const { return sqrtf(a); };
|
||||
};
|
||||
|
||||
template <>
|
||||
struct unary_sqrt<half_t>
|
||||
{
|
||||
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
|
||||
|
||||
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
|
||||
};
|
||||
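// Illustrative sketch (not part of the original source): NORM2 combines a unary_square
// pre-op, an Add accumulation and a unary_sqrt post-op. The function name
// "example_norm2_of_four" and the fixed length 4 are hypothetical.
__device__ inline float example_norm2_of_four(const float (&v)[4])
{
    unary_square<float, false> square; // applied element-wise before accumulation
    unary_sqrt<float> root;            // applied once to the accumulated value

    float acc = Add<float>::GetZeroVal();
    for(int i = 0; i < 4; ++i)
        Add<float>{}(acc, square(v[i]));

    return root(acc); // sqrt of the sum of squares
}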
|
||||
}; // end of namespace reduce
|
||||
|
||||
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
|
||||
// respective functor classes.
|
||||
// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
|
||||
// reduce_binary_operator for
|
||||
// easier checking by the upper-layer code in the kernels.
|
||||
|
||||
template <typename T, ReduceTensorOp_t op>
|
||||
struct reduce_binary_operator;
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Add<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
|
||||
{
|
||||
using opType = reduce::Mul<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Mul<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Mul<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
|
||||
{
|
||||
using opType = reduce::Min<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Min<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Min<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
|
||||
{
|
||||
using opType = reduce::Max<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Max<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
|
||||
{
|
||||
using opType = reduce::Max<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Max<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Add<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Add<T>::indexable;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
__device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };
|
||||
|
||||
static constexpr bool indexable = reduce::Add<T>::indexable;
|
||||
};
|
||||
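// Illustrative sketch (not part of the original source): the mapping above lets kernel
// code query an op's accumulation functor and its properties purely from the enum id, e.g.
static_assert(reduce_binary_operator<float, ReduceTensorOp_t::MAX>::indexable,
              "MAX can record the index of the selected element");
static_assert(!reduce_binary_operator<float, ReduceTensorOp_t::AVG>::indexable,
              "AVG accumulates with Add and records no indices");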
|
||||
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
|
||||
// functor classes.
|
||||
// The two unary functors are called before and after the reduction is executed, respectively
|
||||
template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
|
||||
struct reduce_unary_operator
|
||||
{
|
||||
using preUnaryOp = reduce::unary_identic<T, false>;
|
||||
using posUnaryOp = reduce::unary_identic<T, false>;
|
||||
};
|
||||
|
||||
template <typename T, bool isFirstReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_identic<T, false>;
|
||||
using posUnaryOp = reduce::unary_identic<T, true>;
|
||||
};
|
||||
|
||||
template <typename T, bool isLastReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_abs<T, false>;
|
||||
using posUnaryOp = reduce::unary_identic<T, false>;
|
||||
};
|
||||
|
||||
template <typename T, bool isLastReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_abs<T, false>;
|
||||
using posUnaryOp = reduce::unary_identic<T, false>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_square<T, false>;
|
||||
using posUnaryOp = reduce::unary_identic<T, false>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_square<T, false>;
|
||||
using posUnaryOp = reduce::unary_sqrt<T>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
|
||||
{
|
||||
using preUnaryOp = reduce::unary_identic<T, false>;
|
||||
using posUnaryOp = reduce::unary_sqrt<T>;
|
||||
};
|
||||
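// Illustrative sketch (not part of the original source): for a single-pass NORM2
// reduction (isFirstReduce == isLastReduce == true) the mapping above selects square
// before and sqrt after the accumulation. The alias names "example_*" are hypothetical.
using example_norm2_preOp =
    typename reduce_unary_operator<float, ReduceTensorOp_t::NORM2, true, true>::preUnaryOp;
using example_norm2_posOp =
    typename reduce_unary_operator<float, ReduceTensorOp_t::NORM2, true, true>::posUnaryOp;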
|
||||
} // end of namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,317 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_blockwise.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
|
||||
static_assert(dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)GridSize;
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto one_dim_srcDesc = transform_tensor_descriptor(
|
||||
srcDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
auto src2dDesc = transform_tensor_descriptor(
|
||||
one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
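// round toReduceLen up to a multiple of copySliceLen; e.g. with BlockSize = 256,
// GredAccessesPerThreadInBlock = 2 and toReduceLen = 1000, copySliceLen is 512,
// srcPad is 24 and the padded length becomes 1024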
const auto srcPad =
|
||||
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
|
||||
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pass_through_transform(invariantLen),
|
||||
make_pad_transform(toReduceLen, 0, srcPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
};
|
||||
|
||||
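// get_ref_desc_types rebuilds, from dummy lengths, descriptor types identical to the ones
// the prepare kernel stored in the workspace; only the types are needed here, since the
// run-time descriptors are read back from ws_global (see get_reduction_*_descriptor below)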
template <index_t srcDims, index_t dstDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
|
||||
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_src2dDesc =
|
||||
transform_tensor_descriptor(ref_one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(
|
||||
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the BlockWise and MultiBlock method
|
||||
using refType_src2dDesc_padded_34 = decltype(
|
||||
transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pass_through_transform(ref_invariantLen),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_34 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
(void)ws_buf2_bytes_offset;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
true,
|
||||
true,
|
||||
GredAccessesPerThreadInBlock>;
|
||||
|
||||
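// Run<1> reduces values only; Run<2> also tracks and writes the index of the selected
// element (chosen when the operator is indexable and the caller requested indices)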
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<dstDataType* const __restrict__>(p_dst_global),
|
||||
static_cast<const int* const __restrict__>(nullptr),
|
||||
static_cast<int* const __restrict__>(indices_global));
|
||||
};
|
||||
@@ -0,0 +1,318 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_blockwise.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
|
||||
static_assert(invariantDims::Size() > 0 || dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)GridSize;
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
|
||||
|
||||
auto src2dDesc =
|
||||
transform_tensor_descriptor(srcDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad =
|
||||
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
|
||||
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pass_through_transform(invariantLen),
|
||||
make_pad_transform(toReduceLen, 0, srcPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
};
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_toReduceDimLengths =
|
||||
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
|
||||
static constexpr auto ref_invariantDimLengths =
|
||||
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
|
||||
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
|
||||
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
|
||||
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the BlockWise and MultiBlock method
|
||||
using refType_src2dDesc_padded_34 = decltype(
|
||||
transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pass_through_transform(ref_invariantLen),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_34 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_src2dDesc_padded_34;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
(void)ws_buf2_bytes_offset;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
true,
|
||||
true,
|
||||
GredAccessesPerThreadInBlock>;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<dstDataType* const __restrict__>(p_dst_global),
|
||||
static_cast<const int* const __restrict__>(nullptr),
|
||||
static_cast<int* const __restrict__>(indices_global));
|
||||
};
|
||||
@@ -0,0 +1,323 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_multiblock.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
|
||||
static_assert(dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)GridSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto one_dim_srcDesc = transform_tensor_descriptor(
|
||||
srcDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
auto src2dDesc = transform_tensor_descriptor(
|
||||
one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
|
||||
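// each of the BlkGroupSize blocks gets an equal, copySliceLen-aligned share of the
// reduction length; e.g. toReduceLen = 1000, BlkGroupSize = 4 and copySliceLen = 256
// give reduceSizePerBlock = 256 and a padded length of 1024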
const index_t reduceSizePerBlock =
|
||||
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
|
||||
copySliceLen;
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
|
||||
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pass_through_transform(invariantLen),
|
||||
make_pad_transform(toReduceLen, 0, srcPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
};
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
|
||||
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_src2dDesc =
|
||||
transform_tensor_descriptor(ref_one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(
|
||||
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the BlockWise and MultiBlock method
|
||||
using refType_src2dDesc_padded_34 = decltype(
|
||||
transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pass_through_transform(ref_invariantLen),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_34 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)p_dst_global;
|
||||
(void)indices_global;
|
||||
|
||||
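// workspace layout agreed with the prepare kernel: src2dDesc at offset 0, dst1dDesc at
// offset 2048, and the value/index staging buffers (ws_buf1/ws_buf2) from offset 4096 on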
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
void* ws_buf1_global = static_cast<char*>(ws_global) + 4096;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
GredAccessesPerThreadInBlock>;
|
||||
|
||||
void* const ws_buf2_global =
|
||||
ws_buf2_bytes_offset > 0
|
||||
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
|
||||
: nullptr;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
BlkGroupSize,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
|
||||
static_cast<int* const __restrict__>(ws_buf2_global));
|
||||
};
|
||||
@@ -0,0 +1,323 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_multiblock.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
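// Illustrative use (hypothetical values): with int lens[6] = {4, 8, 16, 1, 1, 1} and
// srcDims == 3, make_tuple_from_array(lens, Number<3>{}) yields make_tuple(4, 8, 16),
// i.e. only the first srcDims entries of the runtime length array are consumed.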
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)GridSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
|
||||
|
||||
auto src2dDesc =
|
||||
transform_tensor_descriptor(srcDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
|
||||
const index_t reduceSizePerBlock =
|
||||
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
|
||||
copySliceLen;
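// reduceSizePerBlock rounds each block-group's share of the reduce dimension up to a
// multiple of copySliceLen. Worked example (hypothetical sizes): toReduceLen = 1000,
// BlkGroupSize = 4, copySliceLen = 256 -> ceil(1000 / 4) = 250, rounded up to 256.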
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
|
||||
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pass_through_transform(invariantLen),
|
||||
make_pad_transform(toReduceLen, 0, srcPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
};
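// The prepare kernel exists because the transformed descriptors depend on runtime
// lengths and strides; building them once here and storing them in ws_global lets the
// reduction kernel below simply reinterpret the stored bytes (via the helpers that
// follow) instead of redoing the transforms.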
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_toReduceDimLengths =
|
||||
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
|
||||
static constexpr auto ref_invariantDimLengths =
|
||||
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
|
||||
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
|
||||
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the BlockWise and MultiBlock method
|
||||
using refType_src2dDesc_padded_34 = decltype(
|
||||
transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pass_through_transform(ref_invariantLen),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
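// get_ref_desc_types only manufactures descriptor *types*: the uniform length 8 and
// the reuse of lengths as strides are placeholders, since the real lengths/strides
// live in the descriptor objects the prepare kernel stored in the workspace.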
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_34 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_src2dDesc_padded_34;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)p_dst_global;
|
||||
(void)indices_global;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
void* ws_buf1_global = static_cast<char*>(ws_global) + 4096;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
GredAccessesPerThreadInBlock>;
|
||||
|
||||
void* const ws_buf2_global =
|
||||
ws_buf2_bytes_offset > 0
|
||||
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
|
||||
: nullptr;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
BlkGroupSize,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
|
||||
static_cast<int* const __restrict__>(ws_buf2_global));
|
||||
};
|
||||
@@ -0,0 +1,330 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto one_dim_srcDesc = transform_tensor_descriptor(
|
||||
srcDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
auto src2dDesc = transform_tensor_descriptor(
|
||||
one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
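// All source dimensions are reduced in this variant, so the flattened tensor is viewed
// as a 1 x N matrix: dim 0 is a unit invariant axis and dim 1 carries the whole
// reduction length.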
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = GredThreadBufferLength;
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad1 = GridSize * BlockSize - invariantLen;
|
||||
const auto srcPad2 =
|
||||
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
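// srcPad1 stretches the invariant dim to one row per launched thread; srcPad2 rounds
// the reduce dim up to a multiple of the per-thread slice. E.g. (hypothetical sizes)
// toReduceLen = 1000, copySliceLen = 64 -> padded length 1024, srcPad2 = 24.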
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
|
||||
make_pad_transform(toReduceLen, 0, srcPad2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if constexpr(dst1d_need_padding)
|
||||
{
|
||||
const auto dstPad = GridSize * BlockSize - invariantLen;
|
||||
auto dst1dDesc_2 =
|
||||
transform_tensor_descriptor(dst1dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_src2dDesc =
|
||||
transform_tensor_descriptor(ref_one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(
|
||||
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the DirectThreadWise and DirectWarpWise method
|
||||
using refType_src2dDesc_padded_12 =
|
||||
decltype(transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_12 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
(void)ws_buf2_bytes_offset;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
true,
|
||||
true,
|
||||
GredThreadBufferLength>;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<dstDataType* const __restrict__>(p_dst_global),
|
||||
static_cast<const int* const __restrict__>(nullptr),
|
||||
static_cast<int* const __restrict__>(indices_global));
|
||||
};
|
||||
@@ -0,0 +1,331 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
|
||||
|
||||
auto src2dDesc =
|
||||
transform_tensor_descriptor(srcDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = GredThreadBufferLength;
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad1 = GridSize * BlockSize - invariantLen;
|
||||
const auto srcPad2 =
|
||||
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
|
||||
make_pad_transform(toReduceLen, 0, srcPad2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if constexpr(dst1d_need_padding)
|
||||
{
|
||||
const auto dstPad = GridSize * BlockSize - invariantLen;
|
||||
auto dst1dDesc_2 =
|
||||
transform_tensor_descriptor(dst1dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_toReduceDimLengths =
|
||||
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
|
||||
static constexpr auto ref_invariantDimLengths =
|
||||
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
|
||||
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
|
||||
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the DirectThreadWise and DirectWarpWise method
|
||||
using refType_src2dDesc_padded_12 =
|
||||
decltype(transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_12 =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_src2dDesc_padded_12;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
|
||||
refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
(void)ws_buf2_bytes_offset;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
true,
|
||||
true,
|
||||
GredThreadBufferLength>;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<dstDataType* const __restrict__>(p_dst_global),
|
||||
static_cast<const int* const __restrict__>(nullptr),
|
||||
static_cast<int* const __restrict__>(indices_global));
|
||||
};
|
||||
@@ -0,0 +1,332 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#include "config.hpp"
|
||||
#include "number.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "data_type_enum_helper.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
using srcDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
|
||||
using dstDataType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
|
||||
using compType =
|
||||
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
|
||||
|
||||
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
|
||||
|
||||
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
|
||||
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
|
||||
|
||||
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
|
||||
|
||||
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
|
||||
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
|
||||
? NanPropagation_t::NOT_PROPAGATE_NAN
|
||||
: NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
|
||||
? ReduceTensorIndices_t::NO_INDICES
|
||||
: ReduceTensorIndices_t::FLATTENED_INDICES;
|
||||
|
||||
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
|
||||
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;
|
||||
|
||||
static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
|
||||
"Wrong invariant and/or toReduce dimensions!");
|
||||
|
||||
// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(dstDims == 1,
|
||||
"If all source dimensions are reduced, the dest should have only one dimension !!");
|
||||
|
||||
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
|
||||
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t... Ns>
|
||||
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(Ns...);
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto one_dim_srcDesc = transform_tensor_descriptor(
|
||||
srcDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
auto src2dDesc = transform_tensor_descriptor(
|
||||
one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
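// Warp-wise variant: the invariant dim is padded below up to GridSize * BlockSize /
// warpSize (one row per launched warp), and each pass over the reduce dim covers
// warpSize * GredAccessesPerThreadInWarp elements.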
|
||||
|
||||
if constexpr(src2d_need_padding)
|
||||
{
|
||||
const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
|
||||
const auto srcPad2 =
|
||||
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
|
||||
|
||||
auto src2dDesc_2 =
|
||||
transform_tensor_descriptor(src2dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
|
||||
make_pad_transform(toReduceLen, 0, srcPad2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
|
||||
}
|
||||
|
||||
if constexpr(dst1d_need_padding)
|
||||
{
|
||||
const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
|
||||
auto dst1dDesc_2 =
|
||||
transform_tensor_descriptor(dst1dDesc,
|
||||
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(hipThreadIdx_x == 0)
|
||||
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t srcDims, index_t dstDims, typename toReduceDims>
|
||||
struct get_ref_desc_types
|
||||
{
|
||||
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
|
||||
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};
|
||||
|
||||
// don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
|
||||
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
|
||||
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
|
||||
|
||||
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
|
||||
ref_srcDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_src2dDesc =
|
||||
transform_tensor_descriptor(ref_one_dim_srcDesc,
|
||||
make_tuple(make_unmerge_transform(
|
||||
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
|
||||
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
|
||||
ref_dstDesc,
|
||||
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
|
||||
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
// used by the DirectThreadWise and DirectWarpWise method
|
||||
using refType_src2dDesc_padded_12 =
|
||||
decltype(transform_tensor_descriptor(ref_src2dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
|
||||
make_pad_transform(ref_toReduceLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{})));
|
||||
|
||||
using refType_dst1dDesc_padded =
|
||||
decltype(transform_tensor_descriptor(ref_dst1dDesc,
|
||||
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{})));
|
||||
|
||||
using refType_src2dDesc = decltype(ref_src2dDesc);
|
||||
using refType_dst1dDesc = decltype(ref_dst1dDesc);
|
||||
};
|
||||
|
||||
using refType_src2dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
|
||||
using refType_dst1dDesc =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
|
||||
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_12;
|
||||
using refType_dst1dDesc_padded =
|
||||
typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
|
||||
};
|
||||
|
||||
template <bool need_padding>
|
||||
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
|
||||
{
|
||||
if constexpr(need_padding)
|
||||
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
|
||||
else
|
||||
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
|
||||
};
|
||||
|
||||
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
|
||||
int BlkGroupSize,
|
||||
float alpha,
|
||||
const void* __restrict__ p_src_global,
|
||||
float beta,
|
||||
void* __restrict__ p_dst_global,
|
||||
void* __restrict__ ws_global,
|
||||
long ws_buf2_bytes_offset,
|
||||
void* __restrict__ indices_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
(void)ws_buf2_bytes_offset;
|
||||
|
||||
const void* p_src2dDesc = ws_global;
|
||||
const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
|
||||
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
|
||||
|
||||
using gridwise_2d_reduce =
|
||||
GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
|
||||
srcDataType,
|
||||
dstDataType,
|
||||
compType,
|
||||
decltype(src2dDesc),
|
||||
decltype(dst1dDesc),
|
||||
op,
|
||||
nanPropaOpt,
|
||||
reduceIndicesOpt,
|
||||
true,
|
||||
true,
|
||||
GredAccessesPerThreadInWarp>;
|
||||
|
||||
constexpr int RunId = need_indices ? 2 : 1;
|
||||
gridwise_2d_reduce::template Run<RunId>(
|
||||
src2dDesc,
|
||||
dst1dDesc,
|
||||
origReduceLen,
|
||||
alpha,
|
||||
static_cast<const srcDataType* const __restrict__>(p_src_global),
|
||||
beta,
|
||||
static_cast<dstDataType* const __restrict__>(p_dst_global),
|
||||
static_cast<const int* const __restrict__>(nullptr),
|
||||
static_cast<int* const __restrict__>(indices_global));
|
||||
};
|
||||
@@ -0,0 +1,333 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt =
    CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN
                                : NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt =
    CK_PARAM_REDUCE_INDICES == 0 ? ReduceTensorIndices_t::NO_INDICES
                                 : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

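// The *_prepare kernel runs once before the reduction proper: it rebuilds the source and
// destination tensor descriptors from the runtime lengths/strides passed in and stores them into
// the workspace (offsets 0 and 2048), from where gridwise_generic_reduce_1 reads them back.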
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
|
||||
int BlkGroupSize,
|
||||
int inLength0,
|
||||
int inLength1,
|
||||
int inLength2,
|
||||
int inLength3,
|
||||
int inLength4,
|
||||
int inLength5,
|
||||
int inStride0,
|
||||
int inStride1,
|
||||
int inStride2,
|
||||
int inStride3,
|
||||
int inStride4,
|
||||
int inStride5,
|
||||
int outLength0,
|
||||
int outLength1,
|
||||
int outLength2,
|
||||
int outLength3,
|
||||
int outLength4,
|
||||
int outLength5,
|
||||
int outStride0,
|
||||
int outStride1,
|
||||
int outStride2,
|
||||
int outStride3,
|
||||
int outStride4,
|
||||
int outStride5,
|
||||
void* __restrict__ ws_global)
|
||||
{
|
||||
(void)BlkGroupSize;
|
||||
|
||||
void* p_src2dDesc = ws_global;
|
||||
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
|
||||
|
||||
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
|
||||
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
|
||||
const int dstLengths[6] = {
|
||||
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
|
||||
const int dstStrides[6] = {
|
||||
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
|
||||
|
||||
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
|
||||
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
|
||||
|
||||
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
|
||||
|
||||
auto src2dDesc =
|
||||
transform_tensor_descriptor(srcDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(invariantDims{}, toReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
auto dst1dDesc = transform_tensor_descriptor(
|
||||
dstDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
|
||||
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
|
||||
|
||||
constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
|
||||
|
||||
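    // Padding sketch: the invariant length is padded up to the number of warps launched
    // (GridSize * BlockSize / warpSize) and the reduce length up to a multiple of copySliceLen,
    // so both lengths line up with the warp-wise access pattern.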
    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
        auto dst1dDesc_2 =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(
    int origReduceLen, int BlkGroupSize, float alpha, const void* __restrict__ p_src_global,
    float beta, void* __restrict__ p_dst_global, void* __restrict__ ws_global,
    long ws_buf2_bytes_offset, void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce =
        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize, srcDataType, dstDataType, compType,
                                                  decltype(src2dDesc), decltype(dst1dDesc),
                                                  op, nanPropaOpt, reduceIndicesOpt,
                                                  true, true, GredAccessesPerThreadInWarp>;

    constexpr int RunId = need_indices ? 2 : 1;
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc, dst1dDesc, origReduceLen, alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};

@@ -0,0 +1,282 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt =
    CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN
                                : NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt =
    CK_PARAM_REDUCE_INDICES == 0 ? ReduceTensorIndices_t::NO_INDICES
                                 : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_2_prepare(
    int GridSize, int BlkGroupSize,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

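// Reference descriptor types built from dummy lengths (8 in every dimension); only the resulting
// types matter here, since they are used below to reinterpret the descriptors that the prepare
// kernel stored in the workspace.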
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths =
        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(ref_tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_2(
    int origReduceLen, float alpha, const void* __restrict__ p_src_global, float beta,
    void* __restrict__ p_dst_global, void* __restrict__ ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
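    // Second-stage inputs live in the workspace: ws_buf1_global is read as this kernel's source
    // (the partial results left by the first stage); ws_buf2_global, valid only when
    // ws_buf2_bytes_offset > 0, carries the matching partial indices.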
    void* ws_buf1_global = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce =
        GridwiseReduction_xy_to_x_blockwise<BlockSize, srcDataType, dstDataType, compType,
                                            decltype(src2dDesc), decltype(dst1dDesc),
                                            op, nanPropaOpt, reduceIndicesOpt,
                                            false, true, GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc, dst1dDesc, origReduceLen, alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};

@@ -0,0 +1,296 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt =
    CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN
                                : NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt =
    CK_PARAM_REDUCE_INDICES == 0 ? ReduceTensorIndices_t::NO_INDICES
                                 : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_2_prepare(
    int GridSize, int BlkGroupSize,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = GredThreadBufferLength;

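    // Judging by the tunable's name, each thread consumes GredThreadBufferLength elements along
    // the reduce dimension per pass, so the reduce length is rounded up to a multiple of it and
    // the invariant length up to the number of threads launched (GridSize * BlockSize).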
    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize - invariantLen;
        auto dst1dDesc_2 =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths =
        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(ref_tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_2(
    int origReduceLen, float alpha, const void* __restrict__ p_src_global, float beta,
    void* __restrict__ p_dst_global, void* __restrict__ ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce =
        GridwiseReduction_xy_to_x_direct_threadwise<BlockSize, srcDataType, dstDataType, compType,
                                                    decltype(src2dDesc), decltype(dst1dDesc),
                                                    op, nanPropaOpt, reduceIndicesOpt,
                                                    false, true, GredThreadBufferLength>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 3 : 1;
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc, dst1dDesc, origReduceLen, alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};

@@ -0,0 +1,298 @@
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt =
    CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN
                                : NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt =
    CK_PARAM_REDUCE_INDICES == 0 ? ReduceTensorIndices_t::NO_INDICES
                                 : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////
using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_2_prepare(
    int GridSize, int BlkGroupSize,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
    const index_t toReduceLen  = BlkGroupSize;

    auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));

    constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;

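    // The pad amounts below round the invariant length up to GridSize * BlockSize / warpSize and
    // the reduce length up to a multiple of warpSize * GredAccessesPerThreadInWarp, which suggests
    // a one-warp-per-invariant-index mapping in the warp-wise kernel.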
    if constexpr(src2d_need_padding)
    {
        const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
        auto dst1dDesc_2 =
            transform_tensor_descriptor(dst1dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
    }
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_tupleDstLengths =
        make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(ref_tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
    static constexpr index_t ref_toReduceLen  = 8;

    static constexpr auto ref_src2dDesc =
        make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_2(
    int origReduceLen, float alpha, const void* __restrict__ p_src_global, float beta,
    void* __restrict__ p_dst_global, void* __restrict__ ws_global, long ws_buf2_bytes_offset,
    void* __restrict__ indices_global)
{
    (void)p_src_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce =
        GridwiseReduction_xy_to_x_direct_warpwise<BlockSize, srcDataType, dstDataType, compType,
                                                  decltype(src2dDesc), decltype(dst1dDesc),
                                                  op, nanPropaOpt, reduceIndicesOpt,
                                                  false, true, GredAccessesPerThreadInWarp>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

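    // RunId 1 is the plain (no-index) reduction path; RunId 3 appears to be the path that carries
    // indices forward from the first stage (the first-stage kernels use RunId 2 when indices are
    // requested).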
    constexpr int RunId = need_indices ? 3 : 1;
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc, dst1dDesc, origReduceLen, alpha,
        static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(ws_buf2_global),
        static_cast<int* const __restrict__>(indices_global));
};