mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 18:42:06 +00:00
* Add reduction across all dims cases.
* host softmax: handle all reduce
* Test cases when reduced dim is not innermost axis.
* Fix syntax.
* Test non innermost dim for fp32 and int8
* Group test suites wrt NumReduceDim.
* Additionally test failing cases.
* Throw error when Rank or NumReduceDims doesn't match arguments.
* Check reducedDims has correct values
* Don't reuse the DeviceReduceMultiblock IsSupportedArgument method;
instead implement our own (in fact, just drop one check to enable
reduction across inner dimensions).
* Reorganize unit tests to better cover use scenarios.
* Test input validation
* Test reduction of inner dimensions with custom op instances.
* Refactor fp32 and int8 unit tests.
* Fix FP32 instance template parameters.
* Add more instances.
* Instances with InSrcVectorDim=0.
* Do not initialize and copy data when arg not supported.
* ckProfiler Softmax use instance factory.
* Refactor device softmax IsSupported.
* Additionally add non-polymorphic api functions
* Split softmax instances into multiple files.
* Fix profiler.
* Reorganize tests to reuse profiler and cover edge cases.
* Clang-format
* I8 Softmax instances along with UT.
* Reuse type alias definitions from instance factory header.
* Clean included headers
* Fix variable names.
* Add missing checks in Argument constructor.
Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: Anthony Chang <ac.chang@outlook.com>
[ROCm/composable_kernel commit: 6d8614ee50]
143 lines
5.5 KiB
C++
143 lines
5.5 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <tuple>
|
|
#include <vector>
|
|
#include <gtest/gtest.h>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
|
|
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
|
#include "include/ck/utility/data_type.hpp"
|
|
#include "profiler/include/profile_softmax_impl.hpp"
|
|
|
|
namespace ck {
|
|
|
|
template <typename Range>
|
|
std::string serialize_range(const Range& range)
|
|
{
|
|
std::stringstream ss;
|
|
for(auto& r : range)
|
|
{
|
|
ss << r << ", ";
|
|
}
|
|
std::string str = ss.str();
|
|
return std::string(str.begin(), str.end() - 2);
|
|
}
|
|
|
|
template <typename Tuple>
|
|
class TestSoftmax : public ::testing::Test
|
|
{
|
|
protected:
|
|
using InDataType = std::tuple_element_t<0, Tuple>;
|
|
using AccDataType = std::tuple_element_t<1, Tuple>;
|
|
using OutDataType = std::tuple_element_t<2, Tuple>;
|
|
static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value;
|
|
|
|
public:
|
|
std::vector<std::vector<index_t>> in_lengths_ = {{2, 128, 1024}, {4, 16, 8448}, {128, 128, 64}};
|
|
std::vector<std::vector<AccDataType>> scales_ = {{2, 0}, {0, 2}, {2, 2}};
|
|
bool bench_ = false; // measure kernel performance
|
|
bool verify_ = true;
|
|
|
|
void SetUp() override
|
|
{
|
|
if constexpr(Rank == 4)
|
|
{
|
|
in_lengths_ = std::vector<std::vector<index_t>>{
|
|
{1, 2, 128, 1024}, {2, 4, 16, 8448}, {1, 128, 128, 64}};
|
|
}
|
|
}
|
|
|
|
void RunSingle(std::vector<index_t> in_length,
|
|
std::vector<index_t> reduce_dims,
|
|
AccDataType alpha,
|
|
AccDataType beta)
|
|
{
|
|
int init_method = 1; // integer value initialization
|
|
bool log = false;
|
|
std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
|
|
bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
|
|
verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
void Run(std::vector<index_t> reduce_dims = {})
|
|
{
|
|
if(reduce_dims.empty())
|
|
{
|
|
reduce_dims.push_back(Rank - 1);
|
|
}
|
|
|
|
for(auto in_length : this->in_lengths_)
|
|
{
|
|
for(auto scale : this->scales_)
|
|
{
|
|
this->RunSingle(in_length, reduce_dims, scale[0], scale[1]);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
template <index_t Rank,
|
|
index_t NumReduceDim,
|
|
index_t BlockSize,
|
|
index_t MThreadClusterSize,
|
|
index_t KThreadClusterSize,
|
|
index_t MThreadSliceSize,
|
|
index_t KThreadSliceSize,
|
|
index_t InSrcVectorDim,
|
|
index_t InSrcVectorSize,
|
|
index_t OutDstVectorSize>
|
|
struct DeviceSoftmaxInstanceWrapper
|
|
{
|
|
using F16 = half_t;
|
|
using F32 = float;
|
|
using Pass = tensor_operation::element_wise::PassThrough;
|
|
|
|
using InDataType = F16;
|
|
using AccDataType = F32;
|
|
using OutDataType = F16;
|
|
using InElementOp = Pass;
|
|
using AccElementOp = Pass;
|
|
|
|
using DeviceSoftmaxInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
|
|
AccDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
AccElementOp,
|
|
Rank,
|
|
NumReduceDim,
|
|
BlockSize,
|
|
MThreadClusterSize,
|
|
KThreadClusterSize,
|
|
MThreadSliceSize,
|
|
KThreadSliceSize,
|
|
InSrcVectorDim,
|
|
InSrcVectorSize,
|
|
OutDstVectorSize>;
|
|
|
|
bool IsSupported(const std::vector<index_t> in_lengths,
|
|
const std::vector<index_t> in_strides,
|
|
const std::vector<index_t> reduce_dims) const
|
|
{
|
|
auto softmax = DeviceSoftmaxInstance{};
|
|
auto argument = softmax.MakeArgument(in_lengths,
|
|
in_strides,
|
|
reduce_dims,
|
|
1, // alpha
|
|
1, // beta
|
|
nullptr, // in_dev
|
|
nullptr, // in_out
|
|
Pass{}, // in elementwise op
|
|
Pass{}); // acc elementwise op
|
|
return softmax.IsSupportedArgument(argument);
|
|
}
|
|
};
|
|
|
|
} // namespace ck
|