// Mirror of https://github.com/ROCm/composable_kernel.git
// (synced 2026-03-23 16:47:40 +00:00; 90 lines, 2.7 KiB, C++)
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/core.hpp"
|
|
#include "ck_tile/host/host_tensor.hpp"
|
|
#include <thread>
|
|
|
|
namespace ck_tile {
|
|
|
|
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
|
|
CK_TILE_HOST void
|
|
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
|
|
{
|
|
index_t rank = x.get_num_of_dimension();
|
|
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
|
|
assert(dim == -1 || dim < rank);
|
|
|
|
index_t target_dim = dim == -1 ? (rank - 1) : dim;
|
|
index_t softmax_len = x.get_length(target_dim);
|
|
index_t n_parallel = x.get_element_size() / softmax_len;
|
|
auto x_len = x.get_lengths();
|
|
|
|
auto f = [&](auto i_element) {
|
|
std::vector<size_t> coord = [&]() {
|
|
std::vector<size_t> t_(rank, 0);
|
|
size_t r = i_element;
|
|
for(index_t i = rank - 1; i >= 0; i--)
|
|
{
|
|
if(i == target_dim)
|
|
continue;
|
|
t_[i] = r % x_len[i];
|
|
r = r / x_len[i];
|
|
}
|
|
return t_;
|
|
}();
|
|
|
|
ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
|
|
|
|
// compute max
|
|
for(auto idx = 0; idx < softmax_len; idx++)
|
|
{
|
|
auto c_ = coord;
|
|
c_[target_dim] = idx;
|
|
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
|
v_max = v_max < v_x ? v_x : v_max;
|
|
}
|
|
|
|
ComputeType v_exp_sum = static_cast<ComputeType>(0);
|
|
|
|
// sum
|
|
for(auto idx = 0; idx < softmax_len; idx++)
|
|
{
|
|
auto c_ = coord;
|
|
c_[target_dim] = idx;
|
|
|
|
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
|
|
|
v_exp_sum += ck_tile::exp(v_x - v_max);
|
|
}
|
|
|
|
// elementwise
|
|
for(auto idx = 0; idx < softmax_len; idx++)
|
|
{
|
|
auto c_ = coord;
|
|
c_[target_dim] = idx;
|
|
|
|
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
|
|
|
auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
|
|
|
|
y(c_) = ck_tile::type_convert<OutputType>(out);
|
|
}
|
|
};
|
|
|
|
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
|
|
}
|
|
|
|
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
|
|
CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
|
|
{
|
|
HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
|
|
|
|
reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
|
|
|
|
return y;
|
|
}
|
|
} // namespace ck_tile
|