Files
composable_kernel/test/layernorm/test_layernorm_fp16.cpp
rocking5566 7f21662089 Standalone layernorm (#315)
* Implement layernorm kernel and deviceOp

* verify gpu kernel with host code

* 1. Separate gamma aand beta from affine
2. Check if argument is valid

* clean

* Sync the naming

* Support sweep once mode if we can put k dimension data inside one block

* [What] Get length from upper length.
[Why] if we get length directly, we may get length after padding.

* We only use one block in K dimension.
Hence, we can simplify the indexing of global R/W.

* Use 1d descriptor for gamma and beta

* Add accElementwiseOp

* Extract layernorm host code

* Support different YVectorDim in GridwiseLayernorm

* Rename XSrcVectorDim to XYSrcVectorDim. Because we use same parameter in deviceOp

* Gamma and beta can share the VGPR.

* Add test for fp32 and fp16

* Fix bug of concurrency and add test case which may fail orignally

* Propagate NaN for layernorm

Co-authored-by: Chao Liu <chao.liu2@amd.com>
2022-07-13 11:16:14 -05:00

30 lines
1.9 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_layernorm_util.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
template <typename Tuple>
class TestLayernormFP16 : public ck::TestLayernorm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>
>;
// clang-format on
TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes);
TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); }