Standalone softmax kernel (#284)

* initial stub for standalone softmax

* start device_softmax_mk_to_mk as a wrapper to device_reduce_mk_to_m

* host softmax validates

* compiles; beta scaling still to be implemented

* use NaN trick to efficiently ignore OOB values during sum of exponentials (see the host sketch after this list)

* reuse device_reduce's utility functions

* clean up interface

* add prior value (beta scaling); see the reference sketch after this list

* remove restriction related to perf considerations

* apply clang-format

* clean; disable diagnostics

* resolve conflicts

* add exp wrapper

* honor the HostTensorDesc interface; allow implicit casts from a different vector<T> type

* test softmax for fp16/fp32

* update readme

* amend NaN-trick commit

* remove redundant param added during development

* format

* replace ScalarDataType with AccDataType

* separate out test programs by precision type

* move softmax sample code to its own folder

* format

* keep up with recent changes in reduction API

* remove extra header
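
The NaN trick called out above is easiest to see in host code. The sketch below is illustrative only; the function name and tile width are hypothetical, not the kernel's API. A common way to ignore out-of-bounds (OOB) lanes is to fill them with a value whose exponential is exactly zero; the kernel's NaN variant instead tags OOB lanes with NaN and lets the NaN-checking accumulator discard them, avoiding a per-lane branch inside the accumulation itself.

#include <cmath>
#include <limits>

// Host-side sketch only. A row of length `n` is read in fixed tiles, so the
// last tile contains OOB lanes. Substituting -infinity for an OOB lane makes
// exp(v - row_max) evaluate to exactly 0, so it drops out of the sum.
float sum_of_exponentials(const float* x, int n, float row_max)
{
    constexpr int Tile = 8; // stand-in for the kernel's vector/cluster width
    float sum          = 0.f;
    for(int base = 0; base < n; base += Tile)
    {
        for(int lane = 0; lane < Tile; ++lane)
        {
            const int i   = base + lane;
            const float v = (i < n) ? x[i] : -std::numeric_limits<float>::infinity();
            sum += std::exp(v - row_max); // exp(-inf - row_max) == 0
        }
    }
    return sum;
}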
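For the "prior value (beta scaling)" item, the commit message suggests the usual BLAS-style alpha/beta blend of a pre-existing output. A hypothetical host reference, assuming y = alpha * softmax(x) + beta * y_prior (names are illustrative, not the kernel's API):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax with alpha/beta blending of a prior output value.
// Assumes x is non-empty.
void softmax_with_prior(const std::vector<float>& x,
                        std::vector<float>& y, // in: prior values; out: result
                        float alpha,
                        float beta)
{
    float m = x[0];
    for(float v : x)
        m = std::max(m, v);

    float sum = 0.f;
    for(float v : x)
        sum += std::exp(v - m); // max-subtracted for numerical stability

    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = alpha * (std::exp(x[i] - m) / sum) + beta * y[i];
}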
Author: Anthony Chang
Date: 2022-06-22 03:59:19 +08:00
Committed by: GitHub
Parent: be60d60d7a
Commit: 15c89e81f0
21 changed files with 1371 additions and 41 deletions

@@ -45,7 +45,9 @@ template <typename AccDataType,
           typename ThreadClusterLengths_M_K,
           typename ThreadClusterArrangeOrder,
           typename OpReduce,
-          bool PropagateNan>
+          bool PropagateNan,
+          typename Accumulation =
+              detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
 struct PartitionedBlockwiseReduction
 {
     static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -62,8 +64,6 @@ struct PartitionedBlockwiseReduction
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
 
-    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
-
     template <typename BufferType>
     __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
     {
@@ -113,13 +113,16 @@ struct PartitionedBlockwiseReduction
 // 3) in_out_value/in_out_index is the input data in vgpr from each thread
 // 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
 // clang-format on
-template <typename AccDataType,
-          typename IndexDataType,
-          index_t BlockSize,
-          typename ThreadClusterLengths_M_K,
-          typename ThreadClusterArrangeOrder,
-          typename OpReduce,
-          bool PropagateNan>
+template <
+    typename AccDataType,
+    typename IndexDataType,
+    index_t BlockSize,
+    typename ThreadClusterLengths_M_K,
+    typename ThreadClusterArrangeOrder,
+    typename OpReduce,
+    bool PropagateNan,
+    typename Accumulation =
+        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
 struct PartitionedBlockwiseReductionWithIndex
 {
     static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -136,9 +139,6 @@ struct PartitionedBlockwiseReductionWithIndex
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
 
-    using Accumulation =
-        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
-
     // This interface accumulates on both data values and indices
     template <typename BufferType, typename IdxBufferType>
     __device__ static void Reduce(BufferType& work_val_buffer,
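
Taken together, these hunks move the Accumulation policy out of the struct body and into a defaulted trailing template parameter: every existing instantiation keeps the NaN-checking behavior, while the new softmax kernel can substitute its own accumulator. A minimal sketch of what that enables follows; the policy type is hypothetical, the cluster shapes are illustrative, and the static Calculate(acc, val) interface is assumed to mirror the detail accumulators, so check the actual header for the exact signature.

// Hypothetical accumulation policy: plain sum, no NaN check. Assumed to
// expose the same static Calculate(acc, val) shape that Reduce() invokes on
// detail::AccumulateWithNanCheck.
struct PlainAddAccumulation
{
    template <typename T>
    __device__ static void Calculate(T& acc, T val)
    {
        acc += val;
    }
};

// Existing users compile unchanged; the trailing parameter defaults to the
// NaN-checking accumulator. 8 * 32 == 256 satisfies the BlockSize assert.
using DefaultReduce = ck::PartitionedBlockwiseReduction<float,               // AccDataType
                                                        256,                 // BlockSize
                                                        ck::Sequence<8, 32>, // ThreadClusterLengths_M_K
                                                        ck::Sequence<1, 0>,  // ThreadClusterArrangeOrder
                                                        ck::reduce::Add,     // OpReduce
                                                        false>;              // PropagateNan

// A softmax-style caller overrides the policy explicitly:
using SoftmaxReduce = ck::PartitionedBlockwiseReduction<float,
                                                        256,
                                                        ck::Sequence<8, 32>,
                                                        ck::Sequence<1, 0>,
                                                        ck::reduce::Add,
                                                        false,
                                                        PlainAddAccumulation>;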