mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
layernorm & groupnorm bwd gamma beta (#1133)
* Add layernorm bwd gamma beta external api
* Add groupnorm external api
* Add layernorm bwd gamma beta profiler
* Add groupnorm bwd gamma beta ckProfiler
* Add layernorm & groupnorm bwd gamma beta test
* Fix groupnorm bwd gamma beta profiler bug
* Layernorm bwd weight client example
* Groupnorm bwd weight client example
* clang format
* Remove useless header
* Let inv_std be positive
* Rename to num_bytes and move this calculation outside the loop
[ROCm/composable_kernel commit: 28f68a5a99]
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp)
|
||||
target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_layernorm2d_bwd_gamma_beta layernorm2d_bwd_gamma_beta.cpp)
|
||||
target_link_libraries(client_layernorm2d_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
|
||||
target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
|
||||
171
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
Normal file
171
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,171 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DGammaDataType = float;
|
||||
using DBetaDataType = float;
|
||||
|
||||
constexpr int Rank = 2;
|
||||
constexpr int NumReduceDim = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N);
|
||||
SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
|
||||
sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N +
|
||||
sizeof(DBetaDataType) * N;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N}, // outLengths
|
||||
{1}, // dgammaStrides
|
||||
{1}, // dbetaStrides
|
||||
{0}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N}, // outLengths
|
||||
{1}, // dgammaStrides
|
||||
{1}, // dbetaStrides
|
||||
{0}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +1,8 @@
|
||||
add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
|
||||
target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_groupnorm_bwd_gamma_beta groupnorm_bwd_gamma_beta.cpp)
|
||||
target_link_libraries(client_groupnorm_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp)
|
||||
target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
180
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
Normal file
180
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DGammaDataType = float;
|
||||
using DBetaDataType = float;
|
||||
|
||||
constexpr int Rank = 5;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 32;
|
||||
ck::index_t H = 16;
|
||||
ck::index_t W = 16;
|
||||
ck::index_t G = 64;
|
||||
ck::index_t C = 128;
|
||||
|
||||
std::size_t length = N * H * W * G * C;
|
||||
|
||||
std::vector<ck::index_t> strideDy = {H * W * G * C, W * G * C, G * C, C, 1};
|
||||
std::vector<ck::index_t> strideX = strideDy;
|
||||
std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
|
||||
std::vector<ck::index_t> strideDGammaBeta = {C, 1};
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * length);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * G * C);
|
||||
SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * G * C);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::size_t num_bytes = sizeof(DYDataType) * length + sizeof(XDataType) * length +
|
||||
sizeof(GammaDataType) * G * C + sizeof(MeanInvStdDataType) * N * G * 2 +
|
||||
sizeof(DGammaDataType) * G * C + sizeof(DBetaDataType) * G * C;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
{G, C},
|
||||
strideDGammaBeta,
|
||||
strideDGammaBeta,
|
||||
{0, 1, 2}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
{G, C},
|
||||
strideDGammaBeta,
|
||||
strideDGammaBeta,
|
||||
{0, 1, 2}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
#ifdef CK_ENABLE_FP32
|
||||
// FP32
|
||||
void add_device_groupnorm_bwd_gamma_beta_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 5, 3>>>&);
|
||||
#endif
|
||||
template <typename DYDataType,
|
||||
typename XDataType,
|
||||
typename MeanInvStdDataType,
|
||||
typename DGammaDataType,
|
||||
typename DBetaDataType>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
5,
|
||||
3>>
|
||||
{
|
||||
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
5,
|
||||
3>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
|
||||
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
|
||||
is_same_v<DBetaDataType, F32>)
|
||||
{
|
||||
add_device_groupnorm_bwd_gamma_beta_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,83 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
#ifdef CK_ENABLE_FP16
|
||||
// FP16
|
||||
void add_device_layernorm2d_bwd_gamma_beta_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F16, F16, F16, F16, F16, 2, 1>>>&);
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
// FP32
|
||||
void add_device_layernorm2d_bwd_gamma_beta_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 2, 1>>>&);
|
||||
#endif
|
||||
template <typename DYDataType,
|
||||
typename XDataType,
|
||||
typename MeanInvStdDataType,
|
||||
typename DGammaDataType,
|
||||
typename DBetaDataType,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>>
|
||||
{
|
||||
using DeviceOp = DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<DYDataType, F16> && is_same_v<XDataType, F16> &&
|
||||
is_same_v<MeanInvStdDataType, F16> && is_same_v<DGammaDataType, F16> &&
|
||||
is_same_v<DBetaDataType, F16>)
|
||||
{
|
||||
if constexpr(Rank == 2 && NumReduceDim == 1)
|
||||
{
|
||||
add_device_layernorm2d_bwd_gamma_beta_f16_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<DYDataType, F32> && is_same_v<XDataType, F32> &&
|
||||
is_same_v<MeanInvStdDataType, F32> && is_same_v<DGammaDataType, F32> &&
|
||||
is_same_v<DBetaDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 2 && NumReduceDim == 1)
|
||||
{
|
||||
add_device_layernorm2d_bwd_gamma_beta_f32_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -8,7 +8,7 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_layernorm2d_bwd_gamma_beta_rank_2_1_f16_instances(
|
||||
void add_device_layernorm2d_bwd_gamma_beta_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F16, F16, F16, F16, F16, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
|
||||
@@ -8,7 +8,7 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_layernorm2d_bwd_gamma_beta_rank_2_1_f32_instances(
|
||||
void add_device_layernorm2d_bwd_gamma_beta_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalizationBwdGammaBeta<F32, F32, F32, F32, F32, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,261 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
template <typename DYDataType,
|
||||
typename XDataType,
|
||||
typename MeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename DGammaDataType,
|
||||
typename DBetaDataType>
|
||||
bool profile_groupnorm_bwd_gamma_beta_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> length)
|
||||
{
|
||||
// we don't need GammaDataType and DXDataType here, just for reference class
|
||||
using GammaDataType = DYDataType;
|
||||
using DXDataType = DYDataType;
|
||||
|
||||
if(length.size() != 5)
|
||||
return false;
|
||||
|
||||
index_t N = length[0];
|
||||
index_t G = length[3];
|
||||
index_t C = length[4];
|
||||
|
||||
std::vector<index_t> reduce_dim = {0, 1, 2};
|
||||
std::vector<index_t> gamma_beta_length = {G, C};
|
||||
|
||||
Tensor<DYDataType> dy(length);
|
||||
Tensor<XDataType> x(length);
|
||||
Tensor<GammaDataType> gamma(gamma_beta_length); // dummy tensor, for reference
|
||||
Tensor<MeanInvStdDataType> mean({N, G});
|
||||
Tensor<MeanInvStdDataType> inv_std({N, G});
|
||||
Tensor<DGammaDataType> dgamma(gamma_beta_length);
|
||||
Tensor<DBetaDataType> dbeta(gamma_beta_length);
|
||||
|
||||
Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
|
||||
Tensor<DGammaDataType> host_dgamma(gamma_beta_length);
|
||||
Tensor<DBetaDataType> host_dbeta(gamma_beta_length);
|
||||
|
||||
std::vector<index_t> strideDy =
|
||||
std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
|
||||
std::vector<index_t> strideX =
|
||||
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
|
||||
|
||||
std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
|
||||
dgamma.mDesc.GetStrides().end()};
|
||||
|
||||
std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
|
||||
dbeta.mDesc.GetStrides().end()};
|
||||
|
||||
std::vector<index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
|
||||
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
|
||||
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
|
||||
break;
|
||||
case 1:
|
||||
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
|
||||
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
|
||||
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
|
||||
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
|
||||
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
|
||||
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
|
||||
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
|
||||
|
||||
dy_dev.ToDevice(dy.mData.data());
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
mean_dev.ToDevice(mean.mData.data());
|
||||
inv_std_dev.ToDevice(inv_std.mData.data());
|
||||
|
||||
// add device normalization instances
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
5,
|
||||
3>;
|
||||
|
||||
// get device op instances
|
||||
const auto instance_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using ReferenceInstance =
|
||||
ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
DXDataType,
|
||||
ComputeDataType>;
|
||||
|
||||
ReferenceInstance ref;
|
||||
auto ref_argument =
|
||||
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
|
||||
auto ref_invoker = ref.MakeInvoker();
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
|
||||
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
|
||||
x.mDesc.GetElementSize() * sizeof(XDataType) +
|
||||
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
|
||||
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
|
||||
dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
|
||||
dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);
|
||||
|
||||
int num_kernel = 0;
|
||||
|
||||
for(auto& inst_ptr : instance_ptrs)
|
||||
{
|
||||
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
gamma_beta_length,
|
||||
strideDGamma,
|
||||
strideDBeta,
|
||||
reduce_dim,
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
++num_kernel;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(time_kernel)
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
|
||||
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
|
||||
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
if(time_kernel)
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< inst_ptr->GetTypeString() << std::endl;
|
||||
|
||||
if(avg_time < best_avg_time)
|
||||
{
|
||||
best_instance_name = inst_ptr->GetTypeString();
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
dgamma_dev.FromDevice(dgamma.mData.data());
|
||||
dbeta_dev.FromDevice(dbeta.mData.data());
|
||||
bool pass =
|
||||
ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
|
||||
|
||||
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
|
||||
<< std::endl;
|
||||
LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
|
||||
}
|
||||
|
||||
if(!pass)
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
|
||||
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(time_kernel)
|
||||
std::cout << "pass" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
LogRange(std::cout << "length = ", length, ",") << ", ";
|
||||
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
|
||||
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
if(num_kernel == 0)
|
||||
{
|
||||
std::cout << "Error: No kernel is applicable" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,263 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
template <typename DYDataType,
|
||||
typename XDataType,
|
||||
typename MeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename DGammaDataType,
|
||||
typename DBetaDataType,
|
||||
index_t Rank>
|
||||
bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> length)
|
||||
{
|
||||
// we don't need GammaDataType and DXDataType here, just for reference class
|
||||
using GammaDataType = DYDataType;
|
||||
using DXDataType = DYDataType;
|
||||
|
||||
if(length.size() != Rank || Rank < 2)
|
||||
return false;
|
||||
|
||||
// Assume normalize dimension for first dimension
|
||||
// Layernorm 2D, input = [M, K], reduce on M axis
|
||||
// Layernorm 4D, input = [N, H, W, C], redice on N axis
|
||||
constexpr int NumReduceDim = Rank - 1;
|
||||
|
||||
std::vector<index_t> reduce_dim = {0};
|
||||
std::vector<index_t> invarient_length{length.begin() + 1, length.end()};
|
||||
|
||||
Tensor<DYDataType> dy(length);
|
||||
Tensor<XDataType> x(length);
|
||||
Tensor<GammaDataType> gamma(invarient_length); // dummy tensor, for reference
|
||||
Tensor<MeanInvStdDataType> mean({length[0]});
|
||||
Tensor<MeanInvStdDataType> inv_std({length[0]});
|
||||
Tensor<DGammaDataType> dgamma(invarient_length);
|
||||
Tensor<DBetaDataType> dbeta(invarient_length);
|
||||
|
||||
Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
|
||||
Tensor<DGammaDataType> host_dgamma(invarient_length);
|
||||
Tensor<DBetaDataType> host_dbeta(invarient_length);
|
||||
|
||||
std::vector<index_t> strideDy =
|
||||
std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
|
||||
std::vector<index_t> strideX = strideDy;
|
||||
|
||||
std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
|
||||
dgamma.mDesc.GetStrides().end()};
|
||||
|
||||
std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
|
||||
dbeta.mDesc.GetStrides().end()};
|
||||
|
||||
std::vector<index_t> strideMeanInvStd{Rank, 0};
|
||||
strideMeanInvStd[0] = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
|
||||
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
|
||||
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
|
||||
break;
|
||||
case 1:
|
||||
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
|
||||
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
|
||||
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
|
||||
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
|
||||
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
|
||||
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
|
||||
dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
|
||||
dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
|
||||
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
|
||||
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
|
||||
|
||||
dy_dev.ToDevice(dy.mData.data());
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
mean_dev.ToDevice(mean.mData.data());
|
||||
inv_std_dev.ToDevice(inv_std.mData.data());
|
||||
|
||||
// add device normalization instances
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto instance_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using ReferenceInstance =
|
||||
ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
DXDataType,
|
||||
ComputeDataType>;
|
||||
|
||||
ReferenceInstance ref;
|
||||
auto ref_argument =
|
||||
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
|
||||
auto ref_invoker = ref.MakeInvoker();
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
|
||||
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
|
||||
x.mDesc.GetElementSize() * sizeof(XDataType) +
|
||||
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
|
||||
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
|
||||
dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
|
||||
dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);
|
||||
|
||||
int num_kernel = 0;
|
||||
|
||||
for(auto& inst_ptr : instance_ptrs)
|
||||
{
|
||||
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
invarient_length,
|
||||
strideDGamma,
|
||||
strideDBeta,
|
||||
reduce_dim,
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
++num_kernel;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(time_kernel)
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
|
||||
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
|
||||
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
if(time_kernel)
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< inst_ptr->GetTypeString() << std::endl;
|
||||
|
||||
if(avg_time < best_avg_time)
|
||||
{
|
||||
best_instance_name = inst_ptr->GetTypeString();
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
dgamma_dev.FromDevice(dgamma.mData.data());
|
||||
dbeta_dev.FromDevice(dbeta.mData.data());
|
||||
bool pass =
|
||||
ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
|
||||
|
||||
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
|
||||
<< std::endl;
|
||||
LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
|
||||
}
|
||||
|
||||
if(!pass)
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
|
||||
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(time_kernel)
|
||||
std::cout << "pass" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
LogRange(std::cout << "length = ", length, ",") << ", ";
|
||||
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
|
||||
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
if(num_kernel == 0)
|
||||
{
|
||||
std::cout << "Error: No kernel is applicable" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace ck
|
||||
@@ -19,6 +19,8 @@ set(PROFILER_SOURCES
|
||||
profile_groupnorm_bwd_data.cpp
|
||||
profile_groupnorm_fwd.cpp
|
||||
profile_layernorm_bwd_data.cpp
|
||||
profile_layernorm_bwd_gamma_beta.cpp
|
||||
profile_groupnorm_bwd_gamma_beta.cpp
|
||||
profile_layernorm_fwd.cpp
|
||||
profile_max_pool3d_fwd.cpp
|
||||
profile_avg_pool3d_bwd.cpp
|
||||
@@ -82,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
|
||||
|
||||
104
profiler/src/profile_groupnorm_bwd_gamma_beta.cpp
Normal file
104
profiler/src/profile_groupnorm_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,104 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
struct groupnormBwdGammaBetaArgParser
|
||||
{
|
||||
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};
|
||||
|
||||
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
|
||||
{
|
||||
if(std::string("--") + key == argv[i])
|
||||
{
|
||||
int pos = i;
|
||||
while(++i < argc && argv[i][0] != '-') {}
|
||||
int end = i;
|
||||
for(int j = pos + 1; j < end; j++)
|
||||
{
|
||||
long_opts[key].push_back(std::stoi(argv[j]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void operator()(int argc, char* argv[])
|
||||
{
|
||||
for(auto& kv : long_opts)
|
||||
{
|
||||
for(int i = 1; i < argc; i++)
|
||||
{
|
||||
if(parse_opt(argc, argv, kv.first, i))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void print_help_groupnorm_bwd_gamma_beta()
|
||||
{
|
||||
// eg: ckProfiler groupnorm_bwd_gamma_beta 1 0 2 0 1 --length 1 16 16 32 40
|
||||
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
|
||||
<< "arg2: verification (0: no; 1: yes)\n"
|
||||
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
|
||||
<< "arg4: print tensor value (0: no; 1: yes)\n"
|
||||
<< "arg5: time kernel (0=no, 1=yes)\n"
|
||||
<< "--length: tensor extents (e.g, --length 1 16 16 32 40) \n"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
int profile_groupnorm_bwd_gamma_beta(int argc, char* argv[])
|
||||
{
|
||||
if(argc <= 2)
|
||||
{
|
||||
print_help_groupnorm_bwd_gamma_beta();
|
||||
return 0;
|
||||
}
|
||||
|
||||
groupnormBwdGammaBetaArgParser arg_parser;
|
||||
|
||||
// short unnamed options
|
||||
const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
|
||||
// parse the long options
|
||||
arg_parser(argc, argv);
|
||||
const std::vector<index_t> length = arg_parser.long_opts["length"];
|
||||
|
||||
using F32 = float;
|
||||
|
||||
if(length.size() == 5)
|
||||
{
|
||||
if(data_type == ck::DataTypeEnum::Float)
|
||||
{
|
||||
ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<F32, F32, F32, F32, F32, F32>(
|
||||
do_verification, init_method, do_log, time_kernel, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("not implemented yet");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("length should be 5");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("groupnorm_bwd_gamma_beta",
|
||||
"Group Normalization",
|
||||
profile_groupnorm_bwd_gamma_beta);
|
||||
112
profiler/src/profile_layernorm_bwd_gamma_beta.cpp
Normal file
112
profiler/src/profile_layernorm_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,112 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
struct layernormBwdGammaBetaArgParser
|
||||
{
|
||||
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};
|
||||
|
||||
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
|
||||
{
|
||||
if(std::string("--") + key == argv[i])
|
||||
{
|
||||
int pos = i;
|
||||
while(++i < argc && argv[i][0] != '-') {}
|
||||
int end = i;
|
||||
for(int j = pos + 1; j < end; j++)
|
||||
{
|
||||
long_opts[key].push_back(std::stoi(argv[j]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void operator()(int argc, char* argv[])
|
||||
{
|
||||
for(auto& kv : long_opts)
|
||||
{
|
||||
for(int i = 1; i < argc; i++)
|
||||
{
|
||||
if(parse_opt(argc, argv, kv.first, i))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void print_help_layernorm_bwd_gamma_beta()
|
||||
{
|
||||
// eg: ckProfiler layernorm_bwd_gamma_beta 0 0 2 0 1 --length 1502 4096
|
||||
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
|
||||
<< "arg2: verification (0: no; 1: yes)\n"
|
||||
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
|
||||
<< "arg4: print tensor value (0: no; 1: yes)\n"
|
||||
<< "arg5: time kernel (0=no, 1=yes)\n"
|
||||
<< "--length: tensor extents (e.g, --length 1024 1024) \n"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
int profile_layernorm_bwd_gamma_beta(int argc, char* argv[])
|
||||
{
|
||||
if(argc <= 2)
|
||||
{
|
||||
print_help_layernorm_bwd_gamma_beta();
|
||||
return 0;
|
||||
}
|
||||
|
||||
layernormBwdGammaBetaArgParser arg_parser;
|
||||
|
||||
// short unnamed options
|
||||
const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
|
||||
// parse the long options
|
||||
arg_parser(argc, argv);
|
||||
const std::vector<index_t> length = arg_parser.long_opts["length"];
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
if(length.size() == 2)
|
||||
{
|
||||
constexpr int rank = 2;
|
||||
|
||||
if(data_type == ck::DataTypeEnum::Half)
|
||||
{
|
||||
ck::profiler::profile_layernorm_bwd_gamma_beta_impl<F16, F16, F16, F32, F16, F16, rank>(
|
||||
do_verification, init_method, do_log, time_kernel, length);
|
||||
}
|
||||
else if(data_type == ck::DataTypeEnum::Float)
|
||||
{
|
||||
ck::profiler::profile_layernorm_bwd_gamma_beta_impl<F32, F32, F32, F32, F32, F32, rank>(
|
||||
do_verification, init_method, do_log, time_kernel, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("not implemented yet");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("not implemented yet");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("layernorm_bwd_gamma_beta",
|
||||
"Layer Normalization",
|
||||
profile_layernorm_bwd_gamma_beta);
|
||||
@@ -140,6 +140,7 @@ add_subdirectory(block_to_ctile_map)
|
||||
add_subdirectory(softmax)
|
||||
add_subdirectory(normalization_fwd)
|
||||
add_subdirectory(normalization_bwd_data)
|
||||
add_subdirectory(normalization_bwd_gamma_beta)
|
||||
add_subdirectory(data_type)
|
||||
add_subdirectory(elementwise_normalization)
|
||||
add_subdirectory(batchnorm)
|
||||
|
||||
13
test/normalization_bwd_gamma_beta/CMakeLists.txt
Normal file
13
test/normalization_bwd_gamma_beta/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
add_custom_target(test_normalization_bwd_gamma_beta)
|
||||
add_gtest_executable(test_layernorm2d_bwd_gamma_beta_fp32 test_layernorm2d_bwd_gamma_beta_fp32.cpp)
|
||||
if(result EQUAL 0)
|
||||
target_link_libraries(test_layernorm2d_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
|
||||
add_dependencies(test_normalization_bwd_gamma_beta test_layernorm2d_bwd_gamma_beta_fp32)
|
||||
endif()
|
||||
|
||||
add_gtest_executable(test_groupnorm_bwd_gamma_beta_fp32 test_groupnorm_bwd_gamma_beta_fp32.cpp)
|
||||
if(result EQUAL 0)
|
||||
target_link_libraries(test_groupnorm_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
|
||||
add_dependencies(test_normalization_bwd_gamma_beta test_groupnorm_bwd_gamma_beta_fp32)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using ck::index_t;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestgroupnormBwdGammaBeta : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using DYDataType = std::tuple_element_t<0, Tuple>;
|
||||
using XDataType = std::tuple_element_t<1, Tuple>;
|
||||
using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
|
||||
using ComputeDataType = std::tuple_element_t<3, Tuple>;
|
||||
using DGammaDataType = std::tuple_element_t<4, Tuple>;
|
||||
using DBetaDataType = std::tuple_element_t<5, Tuple>;
|
||||
|
||||
void Run()
|
||||
{
|
||||
// Bwd data: [N, H, W, G, C], reduce H, W, C
|
||||
std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
|
||||
{1, 2, 3, 4, 5},
|
||||
{256, 9, 9, 9, 9},
|
||||
{1, 64, 64, 32, 10},
|
||||
{1, 32, 32, 32, 20},
|
||||
{1, 16, 16, 32, 40}};
|
||||
|
||||
for(auto length : lengths)
|
||||
{
|
||||
bool success = ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType>(
|
||||
true, 2, false, false, length);
|
||||
EXPECT_TRUE(success);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using KernelTypes = ::testing::Types<
|
||||
// DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
|
||||
std::tuple<F32, F32, F32, F32, F32, F32>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestgroupnormBwdGammaBeta, KernelTypes);
|
||||
TYPED_TEST(TestgroupnormBwdGammaBeta, Test_FP32) { this->Run(); }
|
||||
@@ -0,0 +1,48 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using ck::index_t;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestLayernorm2dBwdGammaBeta : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using DYDataType = std::tuple_element_t<0, Tuple>;
|
||||
using XDataType = std::tuple_element_t<1, Tuple>;
|
||||
using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
|
||||
using ComputeDataType = std::tuple_element_t<3, Tuple>;
|
||||
using DGammaDataType = std::tuple_element_t<4, Tuple>;
|
||||
using DBetaDataType = std::tuple_element_t<5, Tuple>;
|
||||
|
||||
void Run()
|
||||
{
|
||||
// Bwd data: [N, D], reduce D
|
||||
std::vector<std::vector<ck::index_t>> lengths = {
|
||||
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
|
||||
|
||||
for(auto length : lengths)
|
||||
{
|
||||
bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
2>(
|
||||
true, 2, false, false, length);
|
||||
EXPECT_TRUE(success);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using KernelTypes = ::testing::Types<
|
||||
// DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
|
||||
std::tuple<F32, F32, F32, F32, F32, F32>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes);
|
||||
TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }
|
||||
Reference in New Issue
Block a user