mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
* Implement layernorm kernel and deviceOp
* verify gpu kernel with host code
* 1. Separate gamma aand beta from affine
2. Check if argument is valid
* clean
* Sync the naming
* Support sweep once mode if we can put k dimension data inside one block
* [What] Get length from upper length.
[Why] if we get length directly, we may get length after padding.
* We only use one block in K dimension.
Hence, we can simplify the indexing of global R/W.
* Use 1d descriptor for gamma and beta
* Add accElementwiseOp
* Extract layernorm host code
* Support different YVectorDim in GridwiseLayernorm
* Rename XSrcVectorDim to XYSrcVectorDim. Because we use same parameter in deviceOp
* Gamma and beta can share the VGPR.
* Add test for fp32 and fp16
* Fix bug of concurrency and add test case which may fail orignally
* Propagate NaN for layernorm
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 7f21662089]
30 lines
1.7 KiB
C++
30 lines
1.7 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#include "gtest/gtest.h"
|
|
#include "test_layernorm_util.hpp"
|
|
|
|
template <ck::index_t N>
|
|
using I = ck::Number<N>;
|
|
|
|
template <typename Tuple>
|
|
class TestLayernormFP32 : public ck::TestLayernorm<Tuple>
|
|
{
|
|
};
|
|
|
|
// clang-format off
|
|
using KernelTypes = ::testing::Types<
|
|
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
|
|
std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>
|
|
>;
|
|
// clang-format on
|
|
TYPED_TEST_SUITE(TestLayernormFP32, KernelTypes);
|
|
TYPED_TEST(TestLayernormFP32, Test_FP32) { this->Run(); }
|