Group norm (#417)

* Add groupnorm example by layernorm
1.  Reference is not ready
2. Shape of gamma and beta needs to be fixed

* Allow the shape of gamma and beta to be the same as x

* Modify test, instance and client example

* [What] Fix layernorm bug for inputs with more than 2 dimensions.
[Why] We need to get upper length from merge transform instead of embed transform.

* Add reference for groupnorm

* Fuse sigmoid after groupnorm

* [What] Rename original layernorm into layernorm2d
[Why] Prepare to add groupnorm using layernorm5d

* clang-format

* Add groupnorm test

* Refine error message

* Add groupnorm ckProfiler

* Test groupnorm kernel from device_instance

* update example

* update profiler

* Fix test naming

* Fix argc number

* Move descriptor and sweeponce to argument for quick debugging

Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
rocking5566
2022-09-20 11:30:46 +08:00
committed by GitHub
parent f584ab0c54
commit 4eba345f6e
24 changed files with 1218 additions and 416 deletions

View File

@@ -6,8 +6,8 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -15,26 +15,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
void add_device_layernorm_f16_rank2_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
void add_device_layernorm_f32_rank2_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
std::vector<index_t> strideGamma,
std::vector<index_t> strideBeta)
{
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() < 2)
@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
// add device normalization instances
constexpr int NumReduceDim = Rank - 1;
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim>>
instances;
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances);
}
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances);
}
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim>;
if(instances.size() <= 0)
{
throw std::runtime_error("wrong! no device normalization instance found");
}
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
for(auto& inst_ptr : instances)
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideXY,
@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", length, "], ") << std::endl;
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
return;
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();