topk_softmax (#1592)

* topk_softmax * remove some files * fix atomic linear_offset * address various comments, and change sfc get_index api to static(tuple)
2026-05-01 20:21:23 +00:00 · 2024-10-26 23:52:49 +08:00
parent 31bf253aeb
commit b098b71b05
41 changed files with 5603 additions and 226 deletions
--- a/include/ck_tile/host/reference/reference_softmax.hpp
+++ b/include/ck_tile/host/reference/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -9,43 +9,81 @@

 namespace ck_tile {

-template <typename ADataType, typename AccDataType, typename BDataType>
-CK_TILE_HOST void reference_softmax(const HostTensor<ADataType>& a_m_n,
-                                    HostTensor<BDataType>& b_m_n)
+template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
+CK_TILE_HOST void
+reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
 {
-    auto f = [&](auto m) {
-        const int N = a_m_n.mDesc.get_lengths()[1];
+    index_t rank = x.get_num_of_dimension();
+    assert(rank == y.get_num_of_dimension());
+    assert(dim == -1 || dim < rank);

-        AccDataType v_max = ck_tile::numeric<ADataType>::Lowest();
+    index_t target_dim  = dim == -1 ? (rank - 1) : dim;
+    index_t softmax_len = x.get_length(target_dim);
+    index_t n_parallel  = x.get_element_size() / softmax_len;
+    auto x_len          = x.get_lengths();

-        // max
-        for(int n = 0; n < N; ++n)
+    auto f = [&](auto i_element) {
+        std::vector<size_t> coord = [&]() {
+            std::vector<size_t> t_(rank, 0);
+            size_t r = i_element;
+            for(index_t i = rank - 1; i >= 0; i--)
+            {
+                if(i == target_dim)
+                    continue;
+                t_[i] = r % x_len[i];
+                r     = r / x_len[i];
+            }
+            return t_;
+        }();
+
+        ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
+
+        // compute max
+        for(auto idx = 0; idx < softmax_len; idx++)
        {
-            const ADataType v_a = a_m_n(m, n);
-
-            v_max = v_max < v_a ? v_a : v_max;
+            auto c_               = coord;
+            c_[target_dim]        = idx;
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+            v_max                 = v_max < v_x ? v_x : v_max;
        }

-        AccDataType v_exp_sum = 0;
+        ComputeType v_exp_sum = static_cast<ComputeType>(0);

        // sum
-        for(int n = 0; n < N; ++n)
+        for(auto idx = 0; idx < softmax_len; idx++)
        {
-            const ADataType v_a = a_m_n(m, n);
+            auto c_        = coord;
+            c_[target_dim] = idx;

-            v_exp_sum += ck_tile::exp(v_a - v_max);
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+
+            v_exp_sum += ck_tile::exp(v_x - v_max);
        }

        // elementwise
-        for(int n = 0; n < N; ++n)
+        for(auto idx = 0; idx < softmax_len; idx++)
        {
-            const ADataType v_a = a_m_n(m, n);
+            auto c_        = coord;
+            c_[target_dim] = idx;

-            b_m_n(m, n) = ck_tile::exp(v_a - v_max) / v_exp_sum;
+            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
+
+            auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
+
+            y(c_) = ck_tile::type_convert<OutputType>(out);
        }
    };

-    make_ParallelTensorFunctor(f,
-                               b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
+}
+
+template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
+CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
+{
+    HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
+
+    reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
+
+    return y;
 }
 } // namespace ck_tile