// Mirrored from https://github.com/ROCm/composable_kernel.git (synced 2026-03-22)
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

#include <string>
#include <sstream>
#include <tuple>
#include <vector>

#include <gtest/gtest.h>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "include/ck/utility/data_type.hpp"

#include "profiler/profile_softmax_impl.hpp"
static ck::index_t param_mask = 0xffff;
|
|
static ck::index_t instance_index = -1;
|
|
|
|
namespace ck {
template <typename Range>
|
|
std::string serialize_range(const Range& range)
|
|
{
|
|
std::stringstream ss;
|
|
for(auto& r : range)
|
|
{
|
|
ss << r << ", ";
|
|
}
|
|
std::string str = ss.str();
|
|
return std::string(str.begin(), str.end() - 2);
|
|
}
template <typename Tuple>
|
|
class TestSoftmax : public ::testing::Test
|
|
{
|
|
protected:
|
|
using InDataType = std::tuple_element_t<0, Tuple>;
|
|
using AccDataType = std::tuple_element_t<1, Tuple>;
|
|
using OutDataType = std::tuple_element_t<2, Tuple>;
|
|
static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value;
|
|
|
|
public:
|
|
std::vector<std::vector<index_t>> in_lengths_ = {{2, 128, 1024}, {4, 16, 8448}, {128, 128, 64}};
|
|
std::vector<std::vector<AccDataType>> scales_ = {{2, 0}, {0, 2}, {2, 2}};
|
|
bool bench_ = false; // measure kernel performance
|
|
bool verify_ = true;
|
|
|
|
void SetUp() override
|
|
{
|
|
if constexpr(Rank == 4)
|
|
{
|
|
in_lengths_ = std::vector<std::vector<index_t>>{
|
|
{1, 2, 128, 1024}, {2, 4, 16, 8448}, {1, 128, 128, 64}};
|
|
}
|
|
}
|
|
|
|
void RunSingle(std::vector<index_t> in_length,
|
|
std::vector<index_t> reduce_dims,
|
|
AccDataType alpha,
|
|
AccDataType beta,
|
|
index_t instance_index)
|
|
{
|
|
int init_method = 1; // integer value initialization
|
|
bool log = false;
|
|
std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
|
|
bool pass = false;
|
|
|
|
if constexpr(Rank == 3)
|
|
{
|
|
if(reduce_dims.size() == 1)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
else if(reduce_dims.size() == 2)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
else if(reduce_dims.size() == 3)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
}
|
|
else if constexpr(Rank == 4)
|
|
{
|
|
if(reduce_dims.size() == 1)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
else if(reduce_dims.size() == 2)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
else if(reduce_dims.size() == 3)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
else if(reduce_dims.size() == 4)
|
|
pass = ck::profiler::
|
|
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(
|
|
verify_,
|
|
init_method,
|
|
log,
|
|
bench_,
|
|
in_length,
|
|
strides,
|
|
reduce_dims,
|
|
alpha,
|
|
beta,
|
|
instance_index);
|
|
};
|
|
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
void Run(std::vector<index_t> reduce_dims = {})
|
|
{
|
|
if(reduce_dims.empty())
|
|
{
|
|
reduce_dims.push_back(Rank - 1);
|
|
}
|
|
|
|
for(auto in_length : this->in_lengths_)
|
|
{
|
|
for(auto scale : this->scales_)
|
|
{
|
|
this->RunSingle(in_length, reduce_dims, scale[0], scale[1], instance_index);
|
|
}
|
|
}
|
|
}
|
|
};
template <index_t Rank,
|
|
index_t NumReduceDim,
|
|
index_t BlockSize,
|
|
index_t MThreadClusterSize,
|
|
index_t KThreadClusterSize,
|
|
index_t MThreadSliceSize,
|
|
index_t KThreadSliceSize,
|
|
index_t InSrcVectorDim,
|
|
index_t InSrcVectorSize,
|
|
index_t OutDstVectorSize>
|
|
struct DeviceSoftmaxInstanceWrapper
|
|
{
|
|
using F16 = half_t;
|
|
using F32 = float;
|
|
using Pass = tensor_operation::element_wise::PassThrough;
|
|
|
|
using InDataType = F16;
|
|
using AccDataType = F32;
|
|
using OutDataType = F16;
|
|
using InElementOp = Pass;
|
|
using AccElementOp = Pass;
|
|
|
|
using DeviceSoftmaxInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
|
|
AccDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
AccElementOp,
|
|
Rank,
|
|
NumReduceDim,
|
|
BlockSize,
|
|
MThreadClusterSize,
|
|
KThreadClusterSize,
|
|
MThreadSliceSize,
|
|
KThreadSliceSize,
|
|
InSrcVectorDim,
|
|
InSrcVectorSize,
|
|
OutDstVectorSize>;
|
|
|
|
bool IsSupported(const std::vector<index_t> in_lengths,
|
|
const std::vector<index_t> in_strides,
|
|
const std::vector<index_t> reduce_dims) const
|
|
{
|
|
auto softmax = DeviceSoftmaxInstance{};
|
|
auto argument = softmax.MakeArgument(in_lengths,
|
|
in_strides,
|
|
reduce_dims,
|
|
1, // alpha
|
|
1, // beta
|
|
nullptr, // in_dev
|
|
nullptr, // in_out
|
|
Pass{}, // in elementwise op
|
|
Pass{}); // acc elementwise op
|
|
return softmax.IsSupportedArgument(argument);
|
|
}
|
|
};
} // namespace ck