mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
Group norm (#417)
* Add groupnorm example by layernorm 1. Reference is not ready 2. Shape of gamma and beta needs to be fixed * Allow the shape of gamma and beta to be the same as x * Modify test, instance and client example * [What] Fix layernorm bug for more than 2 dimensions. [Why] We need to get the upper length from the merge transform instead of the embed transform. * Add reference for groupnorm * Fuse sigmoid after groupnorm * [What] Rename original layernorm to layernorm2d [Why] Prepare to add groupnorm using layernorm5d * clang-format * Add groupnorm test * Refine error message * Add groupnorm ckProfiler * Test groupnorm kernel from device_instance * Update example * Update profiler * Fix test naming * Fix argc number * Move descriptor and sweeponce to argument for quick debugging Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
@@ -6,8 +6,8 @@
|
||||
#include <iomanip>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "profiler/include/data_type_enum.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -15,26 +15,6 @@
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
void add_device_layernorm_f16_rank2_instances(
|
||||
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
|
||||
|
||||
void add_device_layernorm_f32_rank2_instances(
|
||||
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
|
||||
std::vector<index_t> strideGamma,
|
||||
std::vector<index_t> strideBeta)
|
||||
{
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
if(length.size() < 2)
|
||||
@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
|
||||
gamma_dev.ToDevice(gamma.mData.data());
|
||||
beta_dev.ToDevice(beta.mData.data());
|
||||
|
||||
// add device normalization instances
|
||||
constexpr int NumReduceDim = Rank - 1;
|
||||
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
AccDataType,
|
||||
YDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>>
|
||||
instances;
|
||||
|
||||
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
|
||||
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
|
||||
is_same<AccDataType, F32>::value)
|
||||
{
|
||||
if(length.size() == 2)
|
||||
tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances);
|
||||
}
|
||||
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
|
||||
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
|
||||
is_same<AccDataType, F32>::value)
|
||||
{
|
||||
if(length.size() == 2)
|
||||
tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances);
|
||||
}
|
||||
// add device normalization instances
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
AccDataType,
|
||||
YDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
if(instances.size() <= 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! no device normalization instance found");
|
||||
}
|
||||
// get device op instances
|
||||
const auto instance_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
|
||||
for(auto& inst_ptr : instances)
|
||||
for(auto& inst_ptr : instance_ptrs)
|
||||
{
|
||||
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
|
||||
strideXY,
|
||||
@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
|
||||
if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
|
||||
LogRange(std::cout << "input lengths = [", length, "], ") << std::endl;
|
||||
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
|
||||
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
|
||||
|
||||
Reference in New Issue
Block a user