Fused elementwise layernorm (#468)

* add fused addition lyernorm

* add fused addition lyernorm

* changed CMakelist

* removed annotates

* modified descriptor of C

* fixed bug in gridwise add layernorm

* format the files

* modified name from add&layernorm into elementwise&layernorm

* created fused elementwise layernorm branch

* change input into tuple type

* add sweep once to reduce load & read of C from global memory

* modified Argument api

* modified way to malloc c in global memory

* changed gamma and beta to m_k_desc

* fixed bug when sweep once and move CDataType when define device level struct

* add src dim for gamma and beta

* implement optimization for coalesced

* delete a annotation line

* fixed some bug to meet the requirements of ck

* add bandwidth computing in example, and fixed the time unit

* move device_elementwise_layernorm_impl.hpp into device/impl

* fixed bug in device_elementwise_layernorm_impl.hpp

* changed name from layernorm into normalization

* clang-format the changed files

* changed the names

* moved immidiate results into lds, it become faster in non-sweeponce cases

* changed naming of C into X to make the defination more clear

* changed naming in example

* add tests for elementwise normalization

* move example_elementwise_layernorm_blockwise into folder 44_elementwise_normalization

* move test_elementwise_layernorm_fp16 into new folder

* move elementwise_normalization_instances into a new folder

* add more tests in test_elementwise_layernorm_fp16.cpp

* added some corner cases in test

* fixed method to compute lds size for matrix X

* changed name of 44_elementwise_normalization into 45_elementwise_normalization

* modified some comments

* modified some other confused comments

* reduce redundant tests in test_elementwise_layernorm_fp16.cpp

[ROCm/composable_kernel commit: efbcc6eddc]
This commit is contained in:
guangzlu
2022-10-25 10:23:20 +08:00
committed by GitHub
parent 8554a33418
commit ba4d3575b2
14 changed files with 1814 additions and 4 deletions

View File

@@ -52,3 +52,4 @@ add_subdirectory(block_to_ctile_map)
add_subdirectory(softmax)
add_subdirectory(normalization)
add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)

View File

@@ -0,0 +1,7 @@
add_custom_target(test_elementwise_normalization)
add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestElementwiseLayernorm : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
using GammaDataType = std::tuple_element_t<2, Tuple>;
using BetaDataType = std::tuple_element_t<3, Tuple>;
using AccDataType = std::tuple_element_t<4, Tuple>;
using YDataType = std::tuple_element_t<5, Tuple>;
void Run()
{
// M, N
std::vector<std::vector<ck::index_t>> lengths = {
{1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}};
for(auto length : lengths)
{
bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
BDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType>(
true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<
// ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
std::tuple<F16, F16, F16, F16, F32, F16>>;
TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }

View File

@@ -3,9 +3,9 @@ add_custom_target(test_layernorm)
add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
target_link_libraries(test_layernorm2d_fp16 PRIVATE utility)
target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
@@ -14,4 +14,3 @@ add_dependencies(test_layernorm test_layernorm2d_fp32)
add_dependencies(test_layernorm test_layernorm2d_fp16)
add_dependencies(test_layernorm test_groupnorm_fp16)
add_dependencies(test_layernorm test_groupnorm_fp32)