ckProfiler for layernorm (#330)

* Refine parameter

* Add base class for layernorm

* Add layernorm instance

* Add layernorm to ckProfiler

* Remove redundant

* Add verification

* Fix compile error due to merge
This commit is contained in:
rocking5566
2022-08-12 06:03:54 +08:00
committed by GitHub
parent e08d68d25d
commit fdfd7eb597
12 changed files with 544 additions and 21 deletions

View File

@@ -1,5 +1,7 @@
# device_normalization_instance
# Source files that define the normalization device instances: the two layernorm
# translation units (f16, f32) and the two softmax translation units (f32, f16).
# NOTE(review): the target that consumes DEVICE_NORMALIZATION_INSTANCE_SOURCE is not
# visible in this hunk — confirm how the list is used further down the file.
set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
device_layernorm_f16_instance.cpp
device_layernorm_f32_instance.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f16_f16_instance.cpp
)

View File

@@ -0,0 +1,53 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Short aliases used to keep the instance table and the registration signatures in this
// file readable.
using F16 = ck::half_t; // CK's half_t type
using F32 = float;
// PassThrough functor from ck::tensor_operation::element_wise.
// NOTE(review): presumably an identity element-wise op — confirm against its definition.
using Pass = ck::tensor_operation::element_wise::PassThrough;
// Tuple of candidate DeviceLayernorm configurations whose first, second, third and fifth type
// arguments are F16 and whose fourth type argument is F32. Rank and Reduce are forwarded
// unchanged to every entry, so the same table serves each <Rank, Reduce> pair instantiated
// further down in this file.
//
// Per the column header below, the two cluster sizes of every entry multiply to the block
// size (8*32, 4*64, 2*128 and 1*256 all equal 256).
//
// NOTE(review): the original column header listed 17 names for 18 template arguments; the
// column for the sixth argument (Pass) was missing. It is labelled AccElementwiseOperation
// here — confirm the exact parameter name against the DeviceLayernorm declaration.
template <index_t Rank, index_t Reduce>
using device_layernorm_f16_instances = std::tuple<
// clang-format off
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, AccElementwiseOperation, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>, // fallback kernel
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8, 8, 8>,
DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8, 8, 8>
// clang-format on
>;
// Registers the F16 layernorm candidates for Rank = 2 with 1 reduced dimension.
// A default-constructed device_layernorm_f16_instances<2, 1> tuple is handed to
// add_device_operation_instances together with the caller's `instances` vector.
void add_device_layernorm_f16_rank2_instances(
    std::vector<DeviceNormalization2Ptr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances)
{
    using Rank2Reduce1Candidates = device_layernorm_f16_instances<2, 1>;

    add_device_operation_instances(instances, Rank2Reduce1Candidates{});
}
// Registers the F16 layernorm candidates for Rank = 4 with 3 reduced dimensions.
// A default-constructed device_layernorm_f16_instances<4, 3> tuple is handed to
// add_device_operation_instances together with the caller's `instances` vector.
void add_device_layernorm_f16_rank4_instances(
    std::vector<DeviceNormalization2Ptr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances)
{
    using Rank4Reduce3Candidates = device_layernorm_f16_instances<4, 3>;

    add_device_operation_instances(instances, Rank4Reduce3Candidates{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,51 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Short aliases used to keep the instance table and the registration signatures in this
// file readable.
using F32 = float;
// PassThrough functor from ck::tensor_operation::element_wise.
// NOTE(review): presumably an identity element-wise op — confirm against its definition.
using Pass = ck::tensor_operation::element_wise::PassThrough;
// Tuple of candidate DeviceLayernorm configurations in which all five data-type arguments
// are F32. Rank and Reduce are forwarded unchanged to every entry, so the same table serves
// each <Rank, Reduce> pair instantiated further down in this file.
//
// Per the column header below, the two cluster sizes of every entry multiply to the block
// size (8*32, 4*64, 2*128 and 1*256 all equal 256).
//
// NOTE(review): the original column header listed 17 names for 18 template arguments; the
// column for the sixth argument (Pass) was missing. It is labelled AccElementwiseOperation
// here — confirm the exact parameter name against the DeviceLayernorm declaration.
template <index_t Rank, index_t Reduce>
using device_layernorm_f32_instances = std::tuple<
// clang-format off
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, AccElementwiseOperation, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4, 4, 4>,
DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4, 4, 4>
// clang-format on
>;
// Registers the F32 layernorm candidates for Rank = 2 with 1 reduced dimension.
// A default-constructed device_layernorm_f32_instances<2, 1> tuple is handed to
// add_device_operation_instances together with the caller's `instances` vector.
void add_device_layernorm_f32_rank2_instances(
    std::vector<DeviceNormalization2Ptr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances)
{
    using Rank2Reduce1Candidates = device_layernorm_f32_instances<2, 1>;

    add_device_operation_instances(instances, Rank2Reduce1Candidates{});
}
// Registers the F32 layernorm candidates for Rank = 4 with 3 reduced dimensions.
// A default-constructed device_layernorm_f32_instances<4, 3> tuple is handed to
// add_device_operation_instances together with the caller's `instances` vector.
void add_device_layernorm_f32_rank4_instances(
    std::vector<DeviceNormalization2Ptr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances)
{
    using Rank4Reduce3Candidates = device_layernorm_f32_instances<4, 3>;

    add_device_operation_instances(instances, Rank4Reduce3Candidates{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck