Standalone softmax kernel (#284)

* initial stub for standalone softmax

* start device_softmax_mk_to_mk as a wrapper to device_reduce_mk_to_m

* host softmax validates

* compiles; beta scaling still to be implemented

* use NaN trick to efficiently ignore OOB values during sum of exponentials (see the host sketch after this list)

* reuse device_reduce's utility functions

* clean up interface

* add prior value (beta scaling); see the reference sketch after this list

* remove restriction related to perf considerations

* apply clang-format

* clean; disable diagnostics

* resolve conflicts

* add exp wrapper

* honor the HostTensorDesc interface; allow implicit casts from a different vector<T> type

* test softmax for fp16/fp32

* update readme

* amend NaN-trick commit

* remove redundant param added during development

* format

* replace ScalarDataType with AccDataType

* separate out test programs by precision type

* move softmax sample code to its own folder

* format

* keep up with recent changes in reduction API

* remove extra header
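
The NaN trick called out above is easiest to see in host code. The sketch below is illustrative only; the function name and tile width are hypothetical, not the kernel's API. A common way to ignore out-of-bounds (OOB) lanes is to fill them with a value whose exponential is exactly zero; the kernel's NaN variant instead tags OOB lanes with NaN and lets the NaN-checking accumulator discard them, avoiding a per-lane branch inside the accumulation itself.

#include <cmath>
#include <limits>

// Host-side sketch only. A row of length `n` is read in fixed tiles, so the
// last tile contains OOB lanes. Substituting -infinity for an OOB lane makes
// exp(v - row_max) evaluate to exactly 0, so it drops out of the sum.
float sum_of_exponentials(const float* x, int n, float row_max)
{
    constexpr int Tile = 8; // stand-in for the kernel's vector/cluster width
    float sum          = 0.f;
    for(int base = 0; base < n; base += Tile)
    {
        for(int lane = 0; lane < Tile; ++lane)
        {
            const int i   = base + lane;
            const float v = (i < n) ? x[i] : -std::numeric_limits<float>::infinity();
            sum += std::exp(v - row_max); // exp(-inf - row_max) == 0
        }
    }
    return sum;
}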
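For the "prior value (beta scaling)" item, the commit message suggests the usual BLAS-style alpha/beta blend of a pre-existing output. A hypothetical host reference, assuming y = alpha * softmax(x) + beta * y_prior (names are illustrative, not the kernel's API):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax with alpha/beta blending of a prior output value.
// Assumes x is non-empty.
void softmax_with_prior(const std::vector<float>& x,
                        std::vector<float>& y, // in: prior values; out: result
                        float alpha,
                        float beta)
{
    float m = x[0];
    for(float v : x)
        m = std::max(m, v);

    float sum = 0.f;
    for(float v : x)
        sum += std::exp(v - m); // max-subtracted for numerical stability

    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = alpha * (std::exp(x[i] - m) / sum) + beta * y[i];
}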
Author: Anthony Chang
Date: 2022-06-22 03:59:19 +08:00
Committed by: GitHub
Parent: be60d60d7a
Commit: 15c89e81f0
21 changed files with 1371 additions and 41 deletions

@@ -45,7 +45,9 @@ template <typename AccDataType,
           typename ThreadClusterLengths_M_K,
           typename ThreadClusterArrangeOrder,
           typename OpReduce,
-          bool PropagateNan>
+          bool PropagateNan,
+          typename Accumulation =
+              detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
 struct PartitionedBlockwiseReduction
 {
     static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -62,8 +64,6 @@ struct PartitionedBlockwiseReduction
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
 
-    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
-
     template <typename BufferType>
     __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
     {
@@ -113,13 +113,16 @@ struct PartitionedBlockwiseReduction
 // 3) in_out_value/in_out_index is the input data in vgpr from each thread
 // 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
 // clang-format on
-template <typename AccDataType,
-          typename IndexDataType,
-          index_t BlockSize,
-          typename ThreadClusterLengths_M_K,
-          typename ThreadClusterArrangeOrder,
-          typename OpReduce,
-          bool PropagateNan>
+template <
+    typename AccDataType,
+    typename IndexDataType,
+    index_t BlockSize,
+    typename ThreadClusterLengths_M_K,
+    typename ThreadClusterArrangeOrder,
+    typename OpReduce,
+    bool PropagateNan,
+    typename Accumulation =
+        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
 struct PartitionedBlockwiseReductionWithIndex
 {
     static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -136,9 +139,6 @@ struct PartitionedBlockwiseReductionWithIndex
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
 
-    using Accumulation =
-        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
-
     // This interface accumulates on both data values and indices
     template <typename BufferType, typename IdxBufferType>
     __device__ static void Reduce(BufferType& work_val_buffer,
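
Taken together, these hunks move the Accumulation policy out of the struct body and into a defaulted trailing template parameter: every existing instantiation keeps the NaN-checking behavior, while the new softmax kernel can substitute its own accumulator. A minimal sketch of what that enables follows; the policy type is hypothetical, the cluster shapes are illustrative, and the static Calculate(acc, val) interface is assumed to mirror the detail accumulators, so check the actual header for the exact signature.

// Hypothetical accumulation policy: plain sum, no NaN check. Assumed to
// expose the same static Calculate(acc, val) shape that Reduce() invokes on
// detail::AccumulateWithNanCheck.
struct PlainAddAccumulation
{
    template <typename T>
    __device__ static void Calculate(T& acc, T val)
    {
        acc += val;
    }
};

// Existing users compile unchanged; the trailing parameter defaults to the
// NaN-checking accumulator. 8 * 32 == 256 satisfies the BlockSize assert.
using DefaultReduce = ck::PartitionedBlockwiseReduction<float,               // AccDataType
                                                        256,                 // BlockSize
                                                        ck::Sequence<8, 32>, // ThreadClusterLengths_M_K
                                                        ck::Sequence<1, 0>,  // ThreadClusterArrangeOrder
                                                        ck::reduce::Add,     // OpReduce
                                                        false>;              // PropagateNan

// A softmax-style caller overrides the policy explicitly:
using SoftmaxReduce = ck::PartitionedBlockwiseReduction<float,
                                                        256,
                                                        ck::Sequence<8, 32>,
                                                        ck::Sequence<1, 0>,
                                                        ck::reduce::Add,
                                                        false,
                                                        PlainAddAccumulation>;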