Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-14 02:02:46 +00:00)
Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320)
* Implement multiple-reduction in one kernel (kernels, device ops, examples)
* Add generic elementwise kernel and device interface
* Add generator for normal-distributed data initialization
* Add host reference implementation of batchnorm-forward and batchnorm-infer
* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels
* Remove unneeded includes in batchnorm example
* Rename generic_elementwise to elementwise in kernel and device classes/functions
* Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise
* Change in example 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise
* Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise
* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp
* Removing and renaming files
* Update to synchronize gemm_layernorm client example to the generic element-wise device op API
* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming
* Merge two static member functions in device_elementwise.hpp
* Remove unary_elementwise_1d kernel and device
[ROCm/composable_kernel commit: 53ea4713af]
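The central API change in this commit replaces the arity-specific device ops (DeviceBinaryElementwise, Device5AryElementwise) with a single DeviceElementwise templated on tuples of input/output types, and moves the lengths/strides to the front of MakeArgumentPointer. Below is a minimal sketch of the new call shape, distilled from the updated examples in this diff; the function name, buffer pointers, M, N and StrideX are placeholders for illustration and are not part of the commit.

```cpp
#include <array>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"

using F16 = ck::half_t;
using Add = ck::tensor_operation::element_wise::Add;

// One generic instance instead of DeviceBinaryElementwise<A, B, C, Compute, Add, ...>:
using DeviceAddInstance =
    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<F16, F16>, // input data types
                                                    ck::Tuple<F16>,      // output data types
                                                    Add,
                                                    2,                  // number of dimensions
                                                    8,                  // MPerThread
                                                    ck::Sequence<8, 8>, // scalars per vector, inputs
                                                    ck::Sequence<8>>;   // scalars per vector, outputs

// Argument order is now: lengths, input strides, output strides, inputs, outputs, functor.
void make_add_argument(const void* p_a, const void* p_b, void* p_c,
                       ck::index_t M, ck::index_t N, ck::index_t StrideX)
{
    std::array<const void*, 2> input   = {p_a, p_b};
    std::array<void*, 1> output        = {p_c};
    std::array<ck::index_t, 2> lengths = {M, N};
    std::array<ck::index_t, 2> strides = {StrideX, 1};

    auto op           = DeviceAddInstance{};
    auto argument_ptr = op.MakeArgumentPointer(
        lengths, {strides, strides}, {strides}, input, output, Add{});

    (void)argument_ptr; // in the examples this is checked with IsSupportedArgument() and then run
}
```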
@@ -128,11 +128,14 @@ bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
std::array<void*, 1> output = {p_y};
auto normalize_functor = ck::tensor_operation::element_wise::Normalize{};

auto argument_ptr = p_op->MakeArgumentPointer(input,
std::array<ck::index_t, 2> xyLengths = {M, N};
std::array<ck::index_t, 2> xyStrides = {StrideX, 1};

auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
                                              {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
                                              {xyStrides},
                                              input,
                                              output,
                                              {M, N},
                                              {{StrideX, 1}, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
                                              {{StrideX, 1}},
                                              ck::tensor_operation::element_wise::Normalize{});

if(p_op->IsSupportedArgument(argument_ptr.get()))

@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -16,28 +16,23 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
using EltwiseComputeDataType = F32;
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
|
||||
ABDataType,
|
||||
CDataType,
|
||||
EltwiseComputeDataType,
|
||||
Add,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
8,
|
||||
8>;
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
2,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
typename HostTensorC,
|
||||
typename ComputeDataType,
|
||||
typename Functor,
|
||||
int broadcastDim>
|
||||
void host_broadcast2D(
|
||||
@@ -49,19 +44,19 @@ void host_broadcast2D(
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
ComputeDataType Amn = ck::type_convert<ComputeDataType>(A(m, n));
|
||||
ComputeDataType Cmn = 0;
|
||||
auto Amn = A(m, n);
|
||||
ctype Cmn = 0;
|
||||
if constexpr(broadcastDim == 0)
|
||||
{
|
||||
ComputeDataType Bn = ck::type_convert<ComputeDataType>(B(n));
|
||||
auto Bn = B(n);
|
||||
functor(Cmn, Amn, Bn);
|
||||
}
|
||||
else
|
||||
{
|
||||
ComputeDataType Bm = ck::type_convert<ComputeDataType>(B(m));
|
||||
auto Bm = B(m);
|
||||
functor(Cmn, Amn, Bm);
|
||||
}
|
||||
C(m, n) = ck::type_convert<ctype>(Cmn);
|
||||
C(m, n) = Cmn;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,18 +98,19 @@ int main()
|
||||
b_n_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {c_m_n_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::vector<ck::index_t> a_strides = {Stride, 1};
|
||||
std::vector<ck::index_t> b_strides = {0, 1};
|
||||
std::vector<ck::index_t> c_strides = {Stride, 1};
|
||||
std::array<ck::index_t, 2> abc_lengths = {M, N};
|
||||
std::array<ck::index_t, 2> a_strides = {Stride, 1};
|
||||
std::array<ck::index_t, 2> b_strides = {0, 1};
|
||||
std::array<ck::index_t, 2> c_strides = {Stride, 1};
|
||||
|
||||
auto broadcastAdd = DeviceElementwiseAddInstance{};
|
||||
auto argument = broadcastAdd.MakeArgumentPointer(
|
||||
input, output, {M, N}, {a_strides, b_strides}, {c_strides}, Add{});
|
||||
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
|
||||
|
||||
if(!broadcastAdd.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"DeviceBinaryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
|
||||
@@ -129,12 +125,8 @@ int main()
|
||||
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
|
||||
Tensor<CDataType> host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride));
|
||||
|
||||
host_broadcast2D<Tensor<ABDataType>,
|
||||
Tensor<ABDataType>,
|
||||
Tensor<CDataType>,
|
||||
EltwiseComputeDataType,
|
||||
Add,
|
||||
0>(host_c_m_n, a_m_n, b_n, M, N, Add{});
|
||||
host_broadcast2D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add, 0>(
|
||||
host_c_m_n, a_m_n, b_n, M, N, Add{});
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results c", 1e-3, 1e-3);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -16,29 +16,21 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
using EltwiseComputeDataType = F32;
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
|
||||
ABDataType,
|
||||
CDataType,
|
||||
EltwiseComputeDataType,
|
||||
Add,
|
||||
3,
|
||||
8,
|
||||
1,
|
||||
8,
|
||||
8>;
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
3,
|
||||
8,
|
||||
ck::Sequence<1, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
typename HostTensorC,
|
||||
typename ComputeDataType,
|
||||
typename Functor>
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_broadcast3D_am_bmnk(HostTensorC& C,
|
||||
const HostTensorA& A,
|
||||
const HostTensorB& B,
|
||||
@@ -51,11 +43,11 @@ void host_broadcast3D_am_bmnk(HostTensorC& C,
|
||||
for(std::size_t n = 0; n < shape[1]; ++n)
|
||||
for(std::size_t k = 0; k < shape[2]; ++k)
|
||||
{
|
||||
ComputeDataType a_val = ck::type_convert<ComputeDataType>(A(m));
|
||||
ComputeDataType b_val = ck::type_convert<ComputeDataType>(B(m, n, k));
|
||||
ComputeDataType c_val = 0;
|
||||
auto a_val = A(m);
|
||||
auto b_val = B(m, n, k);
|
||||
ctype c_val = 0;
|
||||
functor(c_val, a_val, b_val);
|
||||
C(m, n, k) = ck::type_convert<ctype>(c_val);
|
||||
C(m, n, k) = c_val;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,25 +77,25 @@ int main()
|
||||
b_m_n_k_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {c_m_n_k_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::vector<ck::index_t> a_strides = {1, 0, 0};
|
||||
std::vector<ck::index_t> b_strides{b_m_n_k.mDesc.GetStrides().begin(),
|
||||
b_m_n_k.mDesc.GetStrides().end()};
|
||||
std::vector<ck::index_t> c_strides{c_m_n_k.mDesc.GetStrides().begin(),
|
||||
c_m_n_k.mDesc.GetStrides().end()};
|
||||
std::array<ck::index_t, 3> abc_lengths;
|
||||
std::array<ck::index_t, 3> a_strides = {1, 0, 0};
|
||||
std::array<ck::index_t, 3> b_strides;
|
||||
std::array<ck::index_t, 3> c_strides;
|
||||
|
||||
std::copy(mnk.begin(), mnk.end(), abc_lengths.begin());
|
||||
std::copy(
|
||||
b_m_n_k.mDesc.GetStrides().begin(), b_m_n_k.mDesc.GetStrides().end(), b_strides.begin());
|
||||
std::copy(
|
||||
c_m_n_k.mDesc.GetStrides().begin(), c_m_n_k.mDesc.GetStrides().end(), c_strides.begin());
|
||||
|
||||
auto broadcastAdd = DeviceElementwiseAddInstance{};
|
||||
auto argument =
|
||||
broadcastAdd.MakeArgumentPointer(input,
|
||||
output,
|
||||
std::vector<ck::index_t>{mnk.begin(), mnk.end()},
|
||||
{a_strides, b_strides},
|
||||
{c_strides},
|
||||
Add{});
|
||||
auto argument = broadcastAdd.MakeArgumentPointer(
|
||||
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
|
||||
|
||||
if(!broadcastAdd.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"DeviceBinaryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
|
||||
@@ -118,11 +110,8 @@ int main()
|
||||
c_m_n_k_device_buf.FromDevice(c_m_n_k.mData.data());
|
||||
Tensor<CDataType> host_c_m_n_k(mnk);
|
||||
|
||||
host_broadcast3D_am_bmnk<Tensor<ABDataType>,
|
||||
Tensor<ABDataType>,
|
||||
Tensor<CDataType>,
|
||||
EltwiseComputeDataType,
|
||||
Add>(host_c_m_n_k, a_m, b_m_n_k, mnk, Add{});
|
||||
host_broadcast3D_am_bmnk<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
|
||||
host_c_m_n_k, a_m, b_m_n_k, mnk, Add{});
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
c_m_n_k.mData, host_c_m_n_k.mData, "Error: Incorrect results c", 1e-3, 1e-3);
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -15,29 +15,21 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
using EltwiseComputeDataType = F32;
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
|
||||
ABDataType,
|
||||
CDataType,
|
||||
EltwiseComputeDataType,
|
||||
Add,
|
||||
1,
|
||||
8,
|
||||
8,
|
||||
8,
|
||||
8>;
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
1,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
typename HostTensorC,
|
||||
typename ComputeDataType,
|
||||
typename Functor>
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_elementwise1D(
|
||||
HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor)
|
||||
{
|
||||
@@ -45,11 +37,11 @@ void host_elementwise1D(
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
ComputeDataType Am = ck::type_convert<ComputeDataType>(A(m));
|
||||
ComputeDataType Bm = ck::type_convert<ComputeDataType>(B(m));
|
||||
ComputeDataType Cm = 0;
|
||||
auto Am = A(m);
|
||||
auto Bm = B(m);
|
||||
ctype Cm = 0;
|
||||
functor(Cm, Am, Bm);
|
||||
C(m) = ck::type_convert<ctype>(Cm);
|
||||
C(m) = Cm;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,18 +75,19 @@ int main()
|
||||
b_m_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {c_m_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::vector<ck::index_t> a_strides = {1};
|
||||
std::vector<ck::index_t> b_strides = {1};
|
||||
std::vector<ck::index_t> c_strides = {1};
|
||||
std::array<ck::index_t, 1> abc_lengths = {M};
|
||||
std::array<ck::index_t, 1> a_strides = {1};
|
||||
std::array<ck::index_t, 1> b_strides = {1};
|
||||
std::array<ck::index_t, 1> c_strides = {1};
|
||||
|
||||
auto broadcastAdd = DeviceElementwiseAddInstance{};
|
||||
auto argument = broadcastAdd.MakeArgumentPointer(
|
||||
input, output, {M}, {{a_strides}, b_strides}, {c_strides}, Add{});
|
||||
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
|
||||
|
||||
if(!broadcastAdd.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"DeviceBinaryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
|
||||
@@ -109,11 +102,8 @@ int main()
|
||||
c_m_device_buf.FromDevice(c_m.mData.data());
|
||||
Tensor<CDataType> host_c_m(f_host_tensor_descriptor1d(M, 1));
|
||||
|
||||
host_elementwise1D<Tensor<ABDataType>,
|
||||
Tensor<ABDataType>,
|
||||
Tensor<CDataType>,
|
||||
EltwiseComputeDataType,
|
||||
Add>(host_c_m, a_m, b_m, M, Add{});
|
||||
host_elementwise1D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
|
||||
host_c_m, a_m, b_m, M, Add{});
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -16,29 +16,21 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
using EltwiseComputeDataType = F32;
|
||||
using ABDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
|
||||
ABDataType,
|
||||
CDataType,
|
||||
EltwiseComputeDataType,
|
||||
Add,
|
||||
4,
|
||||
8,
|
||||
8,
|
||||
8,
|
||||
8>;
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
typename HostTensorC,
|
||||
typename ComputeDataType,
|
||||
typename Functor>
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_elementwise4D(HostTensorC& C,
|
||||
const HostTensorA& A,
|
||||
const HostTensorB& B,
|
||||
@@ -52,11 +44,11 @@ void host_elementwise4D(HostTensorC& C,
|
||||
for(std::size_t h = 0; h < shape[2]; ++h)
|
||||
for(std::size_t w = 0; w < shape[3]; ++w)
|
||||
{
|
||||
ComputeDataType a_val = ck::type_convert<ComputeDataType>(A(n, c, h, w));
|
||||
ComputeDataType b_val = ck::type_convert<ComputeDataType>(B(n, c, h, w));
|
||||
ComputeDataType c_val = 0;
|
||||
auto a_val = A(n, c, h, w);
|
||||
auto b_val = B(n, c, h, w);
|
||||
ctype c_val = 0;
|
||||
functor(c_val, a_val, b_val);
|
||||
C(n, c, h, w) = ck::type_convert<ctype>(c_val);
|
||||
C(n, c, h, w) = c_val;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,23 +77,24 @@ int main()
|
||||
b_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {c_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::vector<ck::index_t> a_strides{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()};
|
||||
std::vector<ck::index_t> b_strides{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()};
|
||||
std::vector<ck::index_t> c_strides{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()};
|
||||
std::array<ck::index_t, 4> abc_lengths;
|
||||
std::array<ck::index_t, 4> a_strides;
|
||||
std::array<ck::index_t, 4> b_strides;
|
||||
std::array<ck::index_t, 4> c_strides;
|
||||
|
||||
std::copy(nchw.begin(), nchw.end(), abc_lengths.begin());
|
||||
std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin());
|
||||
std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin());
|
||||
std::copy(c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end(), c_strides.begin());
|
||||
|
||||
auto broadcastAdd = DeviceElementwiseAddInstance{};
|
||||
auto argument =
|
||||
broadcastAdd.MakeArgumentPointer(input,
|
||||
output,
|
||||
std::vector<ck::index_t>{nchw.begin(), nchw.end()},
|
||||
{{a_strides}, b_strides},
|
||||
{c_strides},
|
||||
Add{});
|
||||
auto argument = broadcastAdd.MakeArgumentPointer(
|
||||
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
|
||||
|
||||
if(!broadcastAdd.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"DeviceBinaryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
|
||||
@@ -116,11 +109,8 @@ int main()
|
||||
c_device_buf.FromDevice(c.mData.data());
|
||||
Tensor<CDataType> host_c(nchw);
|
||||
|
||||
host_elementwise4D<Tensor<ABDataType>,
|
||||
Tensor<ABDataType>,
|
||||
Tensor<CDataType>,
|
||||
EltwiseComputeDataType,
|
||||
Add>(host_c, a, b, nchw, Add{});
|
||||
host_elementwise4D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
|
||||
host_c, a, b, nchw, Add{});
|
||||
|
||||
pass &=
|
||||
ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results c", 1e-3, 1e-3);
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -94,23 +94,18 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
|
||||
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
|
||||
|
||||
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
|
||||
using DeviceNormalizeInstance =
|
||||
ck::tensor_operation::device::Device5AryElementwise<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType,
|
||||
NormalizeComputeDataType,
|
||||
NormalizeFunctor,
|
||||
2,
|
||||
8,
|
||||
8, // scalarPerVector: gemm_out
|
||||
1, // scalarPerVector: reduce_mean
|
||||
1, // scalarPerVector: reduce_mean_square
|
||||
8, // scalarPerVector: Gamma
|
||||
8, // scalarPerVector: Beta
|
||||
8>; // scalarPerVector: LayerNorm_out
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType>, // x(gemm_out), mean, meansquare, gamma, beta
|
||||
ck::Tuple<LayerNormOutDataType>, // y
|
||||
NormalizeFunctor,
|
||||
2,
|
||||
8, // MPerthread
|
||||
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
|
||||
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
|
||||
|
||||
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({len}),
|
||||
@@ -197,14 +192,9 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
NormalizeComputeDataType out_acc = 0;
|
||||
layerNormInst(out_acc,
|
||||
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
|
||||
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
|
||||
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
|
||||
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
|
||||
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
|
||||
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
|
||||
LayerNormOutDataType out_val = 0;
|
||||
layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n));
|
||||
out_m_n(m, n) = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -339,28 +329,28 @@ int main()
|
||||
beta_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
|
||||
|
||||
auto normalize = DeviceNormalizeInstance{};
|
||||
auto normalize_invoker = normalize.MakeInvoker();
|
||||
auto normalize_argument = normalize.MakeArgument(input,
|
||||
output,
|
||||
{M, N},
|
||||
{StrideE, 1},
|
||||
{1, 0},
|
||||
{1, 0},
|
||||
{0, 1},
|
||||
{0, 1},
|
||||
{StrideE, 1},
|
||||
NormalizeFunctor{});
|
||||
std::array<ck::index_t, 2> xyLengths = {M, N};
|
||||
std::array<ck::index_t, 2> xyStrides = {StrideE, 1};
|
||||
|
||||
if(!normalize.IsSupportedArgument(normalize_argument))
|
||||
auto normalize = DeviceNormalizeInstance{};
|
||||
auto normalize_invoker = normalize.MakeInvoker();
|
||||
auto normalize_argument_ptr =
|
||||
normalize.MakeArgumentPointer(xyLengths,
|
||||
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
|
||||
{xyStrides},
|
||||
input,
|
||||
output,
|
||||
NormalizeFunctor{});
|
||||
|
||||
if(!normalize.IsSupportedArgument(normalize_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"Device5AryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device, exiting!");
|
||||
}
|
||||
|
||||
// run kernel
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false});
|
||||
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
bool pass = true;
|
||||
{
|
||||
@@ -396,7 +386,7 @@ int main()
|
||||
float gemm_reduce_mean_reduce_square_mean_ave_time =
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
|
||||
float normalize_ave_time =
|
||||
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
if(time_kernel)
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -91,23 +91,20 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
|
||||
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
|
||||
|
||||
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
|
||||
using DeviceNormalizeInstance =
|
||||
ck::tensor_operation::device::Device5AryElementwise<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType,
|
||||
NormalizeComputeDataType,
|
||||
NormalizeFunctor,
|
||||
2,
|
||||
8,
|
||||
8, // scalarPerVector: gemm_out
|
||||
1, // scalarPerVector: reduce_mean
|
||||
1, // scalarPerVector: reduce_mean_square
|
||||
8, // scalarPerVector: Gamma
|
||||
8, // scalarPerVector: Beta
|
||||
8>; // scalarPerVector: LayerNorm_out
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType>, // x(gemm_out), mean,
|
||||
// meansquare,
|
||||
// gamma, beta
|
||||
ck::Tuple<LayerNormOutDataType>, // y
|
||||
NormalizeFunctor,
|
||||
2,
|
||||
8, // MPerthread
|
||||
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
|
||||
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
|
||||
|
||||
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({len}),
|
||||
@@ -139,7 +136,6 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
|
||||
int M,
|
||||
int N)
|
||||
{
|
||||
|
||||
int StrideE = N;
|
||||
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
|
||||
Tensor<R0DataType> mean_m(f_host_tensor_descriptor1d(M, 1));
|
||||
@@ -184,14 +180,9 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
NormalizeComputeDataType out_acc = 0;
|
||||
layerNormInst(out_acc,
|
||||
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
|
||||
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
|
||||
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
|
||||
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
|
||||
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
|
||||
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
|
||||
LayerNormOutDataType out_val = 0;
|
||||
layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n));
|
||||
out_m_n(m, n) = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -314,28 +305,28 @@ int main()
|
||||
beta_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
|
||||
|
||||
auto normalize = DeviceNormalizeInstance{};
|
||||
auto normalize_invoker = normalize.MakeInvoker();
|
||||
auto normalize_argument = normalize.MakeArgument(input,
|
||||
output,
|
||||
{M, N},
|
||||
{StrideE, 1},
|
||||
{1, 0},
|
||||
{1, 0},
|
||||
{0, 1},
|
||||
{0, 1},
|
||||
{StrideE, 1},
|
||||
NormalizeFunctor{});
|
||||
std::array<ck::index_t, 2> xyLengths = {M, N};
|
||||
std::array<ck::index_t, 2> xyStrides = {StrideE, 1};
|
||||
|
||||
if(!normalize.IsSupportedArgument(normalize_argument))
|
||||
auto normalize = DeviceNormalizeInstance{};
|
||||
auto normalize_invoker = normalize.MakeInvoker();
|
||||
auto normalize_argument_ptr =
|
||||
normalize.MakeArgumentPointer(xyLengths,
|
||||
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
|
||||
{xyStrides},
|
||||
input,
|
||||
output,
|
||||
NormalizeFunctor{});
|
||||
|
||||
if(!normalize.IsSupportedArgument(normalize_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error("The runtime parameters seems not supported by the "
|
||||
"Device5AryElementwise instance, exiting!");
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device, exiting");
|
||||
}
|
||||
|
||||
// run kernel
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false});
|
||||
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, false});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
bool pass = true;
|
||||
{
|
||||
@@ -369,7 +360,7 @@ int main()
|
||||
float gemm_reduce_mean_reduce_square_mean_ave_time =
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
|
||||
float normalize_ave_time =
|
||||
normalize_invoker.Run(normalize_argument, StreamConfig{nullptr, time_kernel});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
if(time_kernel)
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
|
||||
example/33_multiple_reduce/CMakeLists.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
add_example_executable(example_dual_reduce_multiblock dual_reduce_multiblock.cpp)
add_example_executable(example_dual_reduce_threadwise dual_reduce_threadwise.cpp)
example/33_multiple_reduce/README.md (new file, 37 lines)
@@ -0,0 +1,37 @@
# Instructions for ```example_dual_reduce```

## Run ```example_dual_reduce_multiblock```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1
```

Result
```
./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1
launch_and_time_kernel: grid_dim {150, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1>
```

## Run ```example_dual_reduce_threadwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1
```

Result
```
./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1
launch_and_time_kernel: grid_dim {32, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1>
```
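As a quick cross-check (not part of the commit), the GB/s figures above follow directly from the byte count used in `mean_meansquare_dual_reduce_test` in dual_reduce_common.hpp below: bytes = N·H·W·C·sizeof(InDataType) + 2·N·sizeof(OutDataType), and GB/s = bytes / 10^6 / milliseconds. With fp16 input and fp32 outputs:

```
multiblock:  600·28·28·256 · 2 B + 2·600 · 4 B  = 240,849,600 B ; 240.85 / 1.19529 ms ≈ 201.5 GB/s
threadwise:  8000·4·4·4 · 2 B    + 2·8000 · 4 B =   1,088,000 B ;  1.088 / 0.01512 ms ≈  72.0 GB/s
```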
example/33_multiple_reduce/dual_reduce_common.hpp (new file, 313 lines)
@@ -0,0 +1,313 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
|
||||
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class SimpleAppArgs
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inLengths = {600, 28, 28, 256};
|
||||
size_t n, h, w, c;
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 2;
|
||||
bool time_kernel = true;
|
||||
|
||||
public:
|
||||
SimpleAppArgs()
|
||||
{
|
||||
n = inLengths[0];
|
||||
h = inLengths[1];
|
||||
w = inLengths[2];
|
||||
c = inLengths[3];
|
||||
};
|
||||
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
|
||||
"comparing with the host-based reduction"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
if(inLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"Invalid option format! The number of integers is incorrect!");
|
||||
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 2 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
n = inLengths[0];
|
||||
h = inLengths[1];
|
||||
w = inLengths[2];
|
||||
c = inLengths[3];
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
template <typename InDataType, typename OutDataType1, typename OutDataType2, typename AccDataType>
|
||||
static void mean_meansquare_host(const Tensor<InDataType>& in,
|
||||
Tensor<OutDataType1>& mean_ref,
|
||||
Tensor<OutDataType2>& meansquare_ref,
|
||||
size_t n,
|
||||
size_t h,
|
||||
size_t w,
|
||||
size_t c)
|
||||
|
||||
{
|
||||
auto thread_reduce_func = [&](auto iN) {
|
||||
AccDataType mean = ck::type_convert<AccDataType>(0.0f);
|
||||
AccDataType meansquare = ck::type_convert<AccDataType>(0.0f);
|
||||
|
||||
// compute mean, meanquare, variance, invVariance
|
||||
for(std::size_t iH = 0; iH < h; iH++)
|
||||
{
|
||||
for(std::size_t iW = 0; iW < w; iW++)
|
||||
{
|
||||
for(std::size_t iC = 0; iC < c; iC++)
|
||||
{
|
||||
AccDataType curr_value = ck::type_convert<AccDataType>(in(iN, iH, iW, iC));
|
||||
|
||||
mean += curr_value;
|
||||
meansquare += curr_value * curr_value;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
mean = mean / (h * w * c);
|
||||
meansquare = meansquare / (h * w * c);
|
||||
|
||||
mean_ref(iN) = ck::type_convert<OutDataType1>(mean);
|
||||
meansquare_ref(iN) = ck::type_convert<OutDataType2>(meansquare);
|
||||
};
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
std::size_t work_per_thread = (n + num_thread - 1) / num_thread;
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
|
||||
for(std::size_t it = 0; it < num_thread; it++)
|
||||
{
|
||||
std::size_t iN_begin = it * work_per_thread;
|
||||
std::size_t iN_end = std::min(static_cast<size_t>((it + 1) * work_per_thread), n);
|
||||
|
||||
auto f = [=] {
|
||||
for(std::size_t iN = iN_begin; iN < iN_end; iN++)
|
||||
{
|
||||
thread_reduce_func(iN);
|
||||
}
|
||||
};
|
||||
|
||||
threads[it] = joinable_thread(f);
|
||||
}
|
||||
};
|
||||
|
||||
using ReduceOperation = ck::reduce::Add;
|
||||
|
||||
using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
|
||||
using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
|
||||
using InElementwiseOperationTuple =
|
||||
ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>;
|
||||
using AccElementwiseOperationTuple =
|
||||
ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>;
|
||||
|
||||
template <typename DeviceDualReduce,
|
||||
typename InDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
int Rank,
|
||||
int NumReduceDim>
|
||||
int mean_meansquare_dual_reduce_test(size_t n,
|
||||
size_t h,
|
||||
size_t w,
|
||||
size_t c,
|
||||
bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::array<int, NumReduceDim> reduceDims)
|
||||
{
|
||||
const std::vector<size_t> inLengths = {n, h, w, c};
|
||||
|
||||
Tensor<InDataType> in(inLengths);
|
||||
|
||||
std::vector<size_t> outLengths{n};
|
||||
|
||||
Tensor<OutDataType> mean_ref(outLengths);
|
||||
Tensor<OutDataType> mean(outLengths);
|
||||
Tensor<OutDataType> meansquare_ref(outLengths);
|
||||
Tensor<OutDataType> meansquare(outLengths);
|
||||
|
||||
auto inStrides = in.mDesc.GetStrides();
|
||||
auto outStrides = mean.mDesc.GetStrides();
|
||||
|
||||
size_t invariant_total_length = n;
|
||||
size_t reduce_total_length = h * w * c;
|
||||
|
||||
const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
|
||||
const AccDataType beta = ck::type_convert<AccDataType>(0.0f);
|
||||
|
||||
std::size_t num_thread = 1;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1: in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread); break;
|
||||
case 2: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread); break;
|
||||
default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpaceSize());
|
||||
DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_dev.ToDevice(in.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
mean_meansquare_host<InDataType, OutDataType, OutDataType, AccDataType>(
|
||||
in, mean_ref, meansquare_ref, n, h, w, c);
|
||||
};
|
||||
|
||||
constexpr ck::index_t NumInputDim = Rank;
|
||||
constexpr ck::index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1;
|
||||
|
||||
std::array<ck::index_t, NumInputDim> i_inLengths;
|
||||
std::array<ck::index_t, NumInputDim> i_inStrides;
|
||||
std::array<ck::index_t, NumOutputDim> i_outLengths;
|
||||
std::array<ck::index_t, NumOutputDim> i_outStrides;
|
||||
|
||||
std::copy(inLengths.begin(), inLengths.end(), i_inLengths.begin());
|
||||
std::copy(inStrides.begin(), inStrides.end(), i_inStrides.begin());
|
||||
std::copy(outLengths.begin(), outLengths.end(), i_outLengths.begin());
|
||||
std::copy(outStrides.begin(), outStrides.end(), i_outStrides.begin());
|
||||
|
||||
auto dual_reduce_op = DeviceDualReduce{};
|
||||
|
||||
auto argument_ptr = dual_reduce_op.MakeArgumentPointer(
|
||||
i_inLengths,
|
||||
i_inStrides,
|
||||
i_outLengths,
|
||||
{i_outStrides, i_outStrides},
|
||||
reduceDims,
|
||||
{&alpha, &alpha},
|
||||
{&beta, &beta},
|
||||
in_dev.GetDeviceBuffer(),
|
||||
{mean_dev.GetDeviceBuffer(), meansquare_dev.GetDeviceBuffer()},
|
||||
ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
|
||||
ck::make_tuple(
|
||||
AccElementwiseOperation_Mean{static_cast<int32_t>(reduce_total_length)},
|
||||
AccElementwiseOperation_Meansquare{static_cast<int32_t>(reduce_total_length)}));
|
||||
|
||||
if(!dual_reduce_op.IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::cout
|
||||
<< "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
|
||||
<< std::endl;
|
||||
return (-1);
|
||||
};
|
||||
|
||||
std::string reduce_name = dual_reduce_op.GetTypeString();
|
||||
|
||||
auto invoker_ptr = dual_reduce_op.MakeInvokerPointer();
|
||||
|
||||
float avg_time = 0.0f;
|
||||
|
||||
avg_time += invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
|
||||
2 * invariant_total_length * sizeof(OutDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
|
||||
<< std::endl;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
mean_dev.FromDevice(mean.mData.data());
|
||||
meansquare_dev.FromDevice(meansquare.mData.data());
|
||||
pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData);
|
||||
pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
example/33_multiple_reduce/dual_reduce_multiblock.cpp (new file, 98 lines)
@@ -0,0 +1,98 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
|
||||
#include "dual_reduce_common.hpp"
|
||||
|
||||
using namespace ck;
|
||||
using namespace ck::tensor_operation::device;
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = float;
|
||||
using OutDataTypeTuple = Tuple<OutDataType, OutDataType>;
|
||||
using AccDataType = float;
|
||||
|
||||
// for NHWC layer-norm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
constexpr bool PropagateNan = false;
|
||||
|
||||
constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::Set;
|
||||
|
||||
using DeviceDualReduce = DeviceMultipleReduceMultiBlock<2,
|
||||
InDataType,
|
||||
AccDataType,
|
||||
OutDataTypeTuple,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple,
|
||||
OutMemoryDataOperation,
|
||||
PropagateNan,
|
||||
256,
|
||||
4,
|
||||
64,
|
||||
1,
|
||||
1,
|
||||
1, // InSrcVectorDim
|
||||
1,
|
||||
ck::Sequence<1, 1>>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int retval = 0;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
SimpleAppArgs arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
|
||||
|
||||
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
Rank,
|
||||
NumReduceDim>(arg.n,
|
||||
arg.h,
|
||||
arg.w,
|
||||
arg.c,
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
reduceDims);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
|
||||
|
||||
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
Rank,
|
||||
NumReduceDim>(
|
||||
600, 28, 28, 256, true, 2, true, reduceDims);
|
||||
};
|
||||
|
||||
return (retval);
|
||||
}
|
||||
example/33_multiple_reduce/dual_reduce_threadwise.cpp (new file, 93 lines)
@@ -0,0 +1,93 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
|
||||
#include "dual_reduce_common.hpp"
|
||||
|
||||
using namespace ck;
|
||||
using namespace ck::tensor_operation::device;
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = float;
|
||||
using OutDataTypeTuple = Tuple<OutDataType, OutDataType>;
|
||||
using AccDataType = float;
|
||||
|
||||
// for NHWC layer-norm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
constexpr bool PropagateNan = false;
|
||||
|
||||
using DeviceDualReduce = DeviceMultipleReduceThreadWise<2,
|
||||
InDataType,
|
||||
AccDataType,
|
||||
OutDataTypeTuple,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple,
|
||||
PropagateNan,
|
||||
256,
|
||||
1,
|
||||
4,
|
||||
1, // InSrcVectorDim
|
||||
2,
|
||||
ck::Sequence<1, 1>>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int retval = 0;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
SimpleAppArgs arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
|
||||
|
||||
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
Rank,
|
||||
NumReduceDim>(arg.n,
|
||||
arg.h,
|
||||
arg.w,
|
||||
arg.c,
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
reduceDims);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::array<int, NumReduceDim> reduceDims = {1, 2, 3};
|
||||
|
||||
retval = mean_meansquare_dual_reduce_test<DeviceDualReduce,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
Rank,
|
||||
NumReduceDim>(
|
||||
8000, 4, 4, 4, true, 2, true, reduceDims);
|
||||
};
|
||||
|
||||
return (retval);
|
||||
}
|
||||
example/34_batchnorm/CMakeLists.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
example/34_batchnorm/README.md (new file, 56 lines)
@@ -0,0 +1,56 @@
# Instructions for ```batchnorm nhwc``` Example

## Run ```batchnorm forward nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)
#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)
#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes)
#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg5: time kernel (0=no, 1=yes)
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
```

Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.08231 ms, 354.519 GB/s
```

Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 1 0 2 0
echo $?
0
```

## Run ```batchnorm infer nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
```

Result
```
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.28235 ms, 523.329 GB/s
```
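As a reading aid (not part of the commit), the functors added in `batchnorm_common.hpp` below assemble the batchnorm quantities from the two reduction results, mean and mean-square. With ε the user epsilon and α the exponential-average factor, the relations they implement are:

$$\mathrm{var} = E[x^2] - (E[x])^2, \qquad \mathrm{invVar} = \frac{1}{\sqrt{\mathrm{var} + \varepsilon}}, \qquad y = \gamma\,(x - \mu)\,\mathrm{invVar} + \beta$$

$$\mu_{\mathrm{running}} \leftarrow (1-\alpha)\,\mu_{\mathrm{running}} + \alpha\,\mu, \qquad \mathrm{var}_{\mathrm{running}} \leftarrow (1-\alpha)\,\mathrm{var}_{\mathrm{running}} + \alpha\,\mathrm{var}$$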
181
example/34_batchnorm/batchnorm_common.hpp
Normal file
181
example/34_batchnorm/batchnorm_common.hpp
Normal file
@@ -0,0 +1,181 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
// binary operation used to calculate invVariance from mean and meansquare
|
||||
struct InvVariance
|
||||
{
|
||||
InvVariance(double epsilon) : epsilon_(epsilon){};
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr void operator()(T& y, const T& mean, const T& meansquare) const
|
||||
{
|
||||
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T tmp_epsilon = type_convert<T>(epsilon_);
|
||||
|
||||
y = meansquare - mean * mean;
|
||||
y = 1.0f / sqrt(tmp_epsilon + y);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
// (4-in, 2-out) element-wise operation used to update the moving average of mean and variance
|
||||
struct MovingAverage
|
||||
{
|
||||
MovingAverage(double factor) : factor_(factor){};
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr void operator()(T& y0,
|
||||
T& y1,
|
||||
const T& mean,
|
||||
const T& runningMean,
|
||||
const T& meansquare,
|
||||
const T& runningVariance) const
|
||||
{
|
||||
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
|
||||
T tmp_factor = type_convert<T>(factor_);
|
||||
T variance = meansquare - mean * mean;
|
||||
|
||||
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
|
||||
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
|
||||
};
|
||||
|
||||
double factor_;
|
||||
};
|
||||
|
||||
struct MovingAverageAndInvVariance
|
||||
{
|
||||
MovingAverageAndInvVariance(double epsilon, double factor)
|
||||
: epsilon_(epsilon), factor_(factor){};
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr void operator()(T& y0, // resultRunningMean
|
||||
T& y1, // resultRunningVariance
|
||||
T& y2, // saveInvVariance
|
||||
const T& mean,
|
||||
const T& runningMean,
|
||||
const T& meansquare,
|
||||
const T& runningVariance) const
|
||||
{
|
||||
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T tmp_epsilon = type_convert<T>(epsilon_);
|
||||
T tmp_factor = type_convert<T>(factor_);
|
||||
T variance = meansquare - mean * mean;
|
||||
|
||||
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
|
||||
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
|
||||
|
||||
y2 = 1.0f / sqrt(tmp_epsilon + variance);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
double factor_;
|
||||
};
|
||||
|
||||
struct NormalizeInInfer
|
||||
{
|
||||
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
__host__ __device__ constexpr void operator()(T1& y,
|
||||
const T1& x,
|
||||
const T2& mean,
|
||||
const T2& variance,
|
||||
const T2& gamma,
|
||||
const T2& beta) const
|
||||
{
|
||||
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T2 tmp_x, tmp_y;
|
||||
|
||||
tmp_x = type_convert<T2>(x);
|
||||
|
||||
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
|
||||
y = type_convert<T1>(tmp_y);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
struct NormalizeInForward
|
||||
{
|
||||
NormalizeInForward(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
__host__ __device__ constexpr void operator()(T1& y,
|
||||
const T1& x,
|
||||
const T2& mean,
|
||||
const T2& meansquare,
|
||||
const T2& gamma,
|
||||
const T2& beta) const
|
||||
{
|
||||
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T2 tmp_x, tmp_y;
|
||||
T2 variance = meansquare - mean * mean;
|
||||
|
||||
tmp_x = type_convert<T2>(x);
|
||||
|
||||
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
|
||||
y = type_convert<T1>(tmp_y);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
template <int Rank, int NumReduceDim>
|
||||
static inline std::array<int, Rank - NumReduceDim>
|
||||
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
|
||||
{
|
||||
int reduceFlag = 0;
|
||||
|
||||
// flag the bits for the reduceDims
|
||||
for(int i = 0; i < NumReduceDim; i++)
|
||||
{
|
||||
reduceFlag |= 1 << reduceDims[i];
|
||||
};
|
||||
|
||||
std::array<int, Rank - NumReduceDim> invariantDims;
|
||||
|
||||
// collect invariant dimensions
|
||||
int dim = 0;
|
||||
for(int i = 0; i < Rank; i++)
|
||||
if((reduceFlag & (1 << i)) == 0)
|
||||
{
|
||||
invariantDims[dim] = i;
|
||||
dim++;
|
||||
};
|
||||
|
||||
return invariantDims;
|
||||
};
|
||||
295
example/34_batchnorm/batchnorm_forward_impl.hpp
Normal file
295
example/34_batchnorm/batchnorm_forward_impl.hpp
Normal file
@@ -0,0 +1,295 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "batchnorm_common.hpp"
|
||||
|
||||
template <typename InOutDataType,
|
||||
typename AccDataType,
|
||||
ck::index_t Rank,
|
||||
ck::index_t NumBatchNormReduceDim,
|
||||
bool fastest_dim_is_reduced = false>
|
||||
int bnorm_fwd(bool time_kernel,
|
||||
bool updateMovingAverage,
|
||||
bool saveMeanAndInvVariance,
|
||||
const std::array<int, NumBatchNormReduceDim> reduceDims,
|
||||
const std::array<ck::index_t, Rank> xyLengths,
|
||||
const std::array<ck::index_t, Rank> xStrides,
|
||||
const std::array<ck::index_t, Rank> yStrides,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
|
||||
const void* p_x,
|
||||
const void* p_scale,
|
||||
const void* p_bias,
|
||||
void* p_y,
|
||||
double exponentialAverageFactor,
|
||||
void* p_runningMean,
|
||||
void* p_runningVariance,
|
||||
double epsilon,
|
||||
void* p_saveMean,
|
||||
void* p_saveInvVariance,
|
||||
void* p_tmp_mean,
|
||||
void* p_tmp_meansquare)
|
||||
{
|
||||
static_assert(NumBatchNormReduceDim < Rank,
|
||||
"Invalid number of reduced dimensions for batchnorm!");
|
||||
|
||||
constexpr ck::index_t NumScaleBiasMeanVarDim = Rank - NumBatchNormReduceDim;
|
||||
|
||||
using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
|
||||
using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
|
||||
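// A single multiple-reduction pass produces both E[x] and E[x^2]; the variance is recovered later
// as E[x^2] - E[x]^2 by the elementwise functors.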
using DeviceMeanAndMeansquareInstance =
|
||||
ck::tensor_operation::device::DeviceMultipleReduceMultiBlock<
|
||||
2,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
ck::Tuple<AccDataType, AccDataType>,
|
||||
Rank,
|
||||
NumBatchNormReduceDim,
|
||||
ck::reduce::Add,
|
||||
ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>,
|
||||
ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>,
|
||||
ck::InMemoryDataOperationEnum::Set,
|
||||
false, // PropagateNan
|
||||
256,
|
||||
16,
|
||||
16,
|
||||
1,
|
||||
1,
|
||||
fastest_dim_is_reduced ? 1 : 0,
|
||||
1,
|
||||
ck::Sequence<1, 1>>;
|
||||
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
|
||||
// meansquare,
|
||||
// scale, bias
|
||||
ck::Tuple<InOutDataType>, // y
|
||||
NormalizeInForward,
|
||||
Rank,
|
||||
2, // MPerThread
|
||||
ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, meansquare, scale, bias
|
||||
ck::Sequence<1>>; // scalarPerVector: y
|
||||
|
||||
using DeviceInvVarianceInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<AccDataType, AccDataType>, // mean, meansquare
|
||||
ck::Tuple<AccDataType>, // invVariance
|
||||
InvVariance,
|
||||
NumScaleBiasMeanVarDim,
|
||||
2, // MPerThread
|
||||
ck::Sequence<1, 1>, // scalarPerVector: mean, meansquare
|
||||
ck::Sequence<1>>; // scalarPerVector: invVariance
|
||||
|
||||
using DeviceMovingAverageInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new mean,
|
||||
// old moving variance, new
|
||||
// meansquare
|
||||
ck::Tuple<AccDataType, AccDataType>, // updated moving mean, updated moving variance
|
||||
MovingAverage,
|
||||
NumScaleBiasMeanVarDim,
|
||||
4, // MPerThread
|
||||
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
|
||||
// variance, new meansquare
|
||||
ck::Sequence<1, 1>>; // scalarPerVector: updated moving mean, updated moving variance
|
||||
|
||||
using DeviceMovingAverageAndInvVarianceInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new
|
||||
// mean, old moving
|
||||
// variance, new
|
||||
// meansquare
|
||||
ck::Tuple<AccDataType, AccDataType, AccDataType>, // updated moving mean, updated moving
|
||||
// variance, invVariance
|
||||
MovingAverageAndInvVariance,
|
||||
NumScaleBiasMeanVarDim,
|
||||
4, // MPerThread
|
||||
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
|
||||
// variance, new meansquare
|
||||
ck::Sequence<1, 1, 1>>; // scalarPerVector: updated moving mean, updated moving variance, invVariance
|
||||
|
||||
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
|
||||
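// Expand the scale/bias/mean/var strides to Rank dimensions; reduced dimensions keep stride 0 so
// the per-channel values are broadcast by the elementwise kernel.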
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
int32_t reduceLength = 1;
|
||||
|
||||
for(auto dim : reduceDims)
|
||||
reduceLength *= xyLengths[dim];
|
||||
|
||||
int32_t invariantLength = 1;
|
||||
|
||||
for(auto dim : invariantDims)
|
||||
invariantLength *= xyLengths[dim];
|
||||
|
||||
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
|
||||
|
||||
float avg_time = 0.0f;
|
||||
std::size_t num_bytes = 0;
|
||||
|
||||
auto dev_mean_and_meansquare = DeviceMeanAndMeansquareInstance{};
|
||||
|
||||
void* p_mean = saveMeanAndInvVariance ? p_saveMean : p_tmp_mean;
|
||||
|
||||
const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
|
||||
const AccDataType beta = ck::type_convert<AccDataType>(0.0f);
|
||||
|
||||
auto argument_ptr1 = dev_mean_and_meansquare.MakeArgumentPointer(
|
||||
xyLengths,
|
||||
xStrides,
|
||||
bnScaleBiasMeanVarLengths,
|
||||
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
|
||||
reduceDims,
|
||||
{&alpha, &alpha},
|
||||
{&beta, &beta},
|
||||
p_x,
|
||||
{p_mean, p_tmp_meansquare},
|
||||
ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
|
||||
ck::make_tuple(AccElementwiseOperation_Mean{reduceLength},
|
||||
AccElementwiseOperation_Meansquare{reduceLength}));
|
||||
|
||||
auto dev_normalize = DeviceNormalizeInstance{};
|
||||
|
||||
auto argument_ptr2 =
|
||||
dev_normalize.MakeArgumentPointer(xyLengths,
|
||||
{xStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{yStrides},
|
||||
{p_x, p_mean, p_tmp_meansquare, p_scale, p_bias},
|
||||
{p_y},
|
||||
NormalizeInForward{epsilon});
|
||||
|
||||
if(!dev_mean_and_meansquare.IsSupportedArgument(argument_ptr1.get()) ||
|
||||
!dev_normalize.IsSupportedArgument(argument_ptr2.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
|
||||
<< std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr1 = dev_mean_and_meansquare.MakeInvokerPointer();
|
||||
auto invoker_ptr2 = dev_normalize.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
|
||||
avg_time += invoker_ptr2->Run(argument_ptr2.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
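// Rough estimate of global-memory traffic: kernel 1 reads x and writes mean/meansquare, kernel 2
// reads x plus four per-channel tensors (counted per element) and writes y.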
num_bytes +=
|
||||
(total_length * sizeof(InOutDataType) + invariantLength * 2 * sizeof(AccDataType)) + // No.1
|
||||
(total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
|
||||
total_length * sizeof(InOutDataType)); // No.2
|
||||
|
||||
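// A third, optional elementwise pass finalizes the running statistics and/or the inverse
// variance; when both are requested they are fused into a single kernel.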
if(saveMeanAndInvVariance && updateMovingAverage)
|
||||
{
|
||||
auto dev_moving_average_inv_variance = DeviceMovingAverageAndInvVarianceInstance{};
|
||||
|
||||
auto argument_ptr3 = dev_moving_average_inv_variance.MakeArgumentPointer(
|
||||
bnScaleBiasMeanVarLengths,
|
||||
{bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides},
|
||||
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
|
||||
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
|
||||
{p_runningMean, p_runningVariance, p_saveInvVariance},
|
||||
MovingAverageAndInvVariance{epsilon, exponentialAverageFactor});
|
||||
|
||||
if(!dev_moving_average_inv_variance.IsSupportedArgument(argument_ptr3.get()))
|
||||
{
|
||||
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr3 = dev_moving_average_inv_variance.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
num_bytes += invariantLength * (4 + 3) * sizeof(AccDataType) * 2; // No.5
|
||||
}
|
||||
else if(saveMeanAndInvVariance)
|
||||
{
|
||||
auto dev_inv_variance = DeviceInvVarianceInstance{};
|
||||
auto argument_ptr3 = dev_inv_variance.MakeArgumentPointer(
|
||||
bnScaleBiasMeanVarLengths,
|
||||
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
|
||||
{bnScaleBiasMeanVarStrides},
|
||||
{p_mean, p_tmp_meansquare},
|
||||
{p_saveInvVariance},
|
||||
InvVariance{epsilon});
|
||||
|
||||
if(!dev_inv_variance.IsSupportedArgument(argument_ptr3.get()))
|
||||
{
|
||||
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr3 = dev_inv_variance.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
num_bytes += invariantLength * (2 + 1) * sizeof(AccDataType);
|
||||
}
|
||||
else if(updateMovingAverage)
|
||||
{
|
||||
auto dev_moving_average = DeviceMovingAverageInstance{};
|
||||
|
||||
auto argument_ptr3 = dev_moving_average.MakeArgumentPointer(
|
||||
bnScaleBiasMeanVarLengths,
|
||||
{bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides,
|
||||
bnScaleBiasMeanVarStrides},
|
||||
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
|
||||
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
|
||||
{p_runningMean, p_runningVariance},
|
||||
MovingAverage{exponentialAverageFactor});
|
||||
|
||||
if(!dev_moving_average.IsSupportedArgument(argument_ptr3.get()))
|
||||
{
|
||||
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr3 = dev_moving_average.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
num_bytes += invariantLength * (4 + 2) * sizeof(AccDataType) * 2; // No.5
|
||||
};
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
};
|
||||
|
||||
return (0);
|
||||
};
|
||||
example/34_batchnorm/batchnorm_forward_nhwc.cpp (new file, 466 lines)
@@ -0,0 +1,466 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp"
|
||||
|
||||
#include "batchnorm_forward_impl.hpp"
|
||||
|
||||
template <typename InOutDataType, typename AccDataType>
|
||||
using ReferenceBatchNormFwdInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormFwd_Input_N_H_W_C_Output_C<InOutDataType,
|
||||
AccDataType>;
|
||||
|
||||
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class BatchNormFwdArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
bool updateMovingAverage;
|
||||
bool saveMeanAndInvVariance;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
|
||||
"lengths, must have 4 integers for nhwc"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
|
||||
"result by "
|
||||
"comparing with the host-based batch-normalization"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer "
|
||||
"value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 5 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
updateMovingAverage = std::atoi(argv[optind++]);
|
||||
saveMeanAndInvVariance = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename InOutDataType, typename AccDataType>
|
||||
bool bnorm_fwd_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
bool updateMovingAverage,
|
||||
bool saveMeanAndInvVariance,
|
||||
double averageFactor,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> y_ref(inOutLengths);
|
||||
Tensor<InOutDataType> y(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
Tensor<AccDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
{
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 2.5f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the runningMean to be values with tiny variation to the mean of the x
|
||||
// values
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
// initialize the runningVariance to be values with tiny variation to the variance of
|
||||
// the x values
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
else
|
||||
x.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0f, 5.0f}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// mean_dev or resultSaveMean_dev
|
||||
DeviceMem resultSaveMean_dev(sizeof(AccDataType) *
|
||||
resultSaveMean_ref.mDesc.GetElementSpaceSize());
|
||||
// meansquare_dev or resultSaveInvVariance_dev
|
||||
DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) *
|
||||
resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningMean_dev
|
||||
DeviceMem resultRunningMean_dev(sizeof(AccDataType) *
|
||||
resultRunningMean_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningVariance_dev
|
||||
DeviceMem resultRunningVariance_dev(sizeof(AccDataType) *
|
||||
resultRunningVariance_ref.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
bnBias_dev.ToDevice(bnBias.mData.data());
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
|
||||
resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
|
||||
};
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
|
||||
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
|
||||
std::copy(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
i_scaleBiasMeanVarLengths.begin());
|
||||
std::copy(scaleBiasMeanVarStrides.begin(),
|
||||
scaleBiasMeanVarStrides.end(),
|
||||
i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
int result = 0;
|
||||
|
||||
// used for saving meansquare
|
||||
DeviceMem workspace(sizeof(AccDataType) * 2 * resultSaveMean_ref.mDesc.GetElementSpaceSize() +
|
||||
128);
|
||||
|
||||
void* p_tmp_mean = workspace.GetDeviceBuffer();
|
||||
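// The temporary meansquare buffer starts right after the mean buffer, rounded up to a 64-byte
// boundary (the workspace was allocated with 128 extra bytes to absorb the padding).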
void* p_tmp_meansquare =
|
||||
static_cast<char*>(p_tmp_mean) +
|
||||
(sizeof(AccDataType) * resultSaveMean_ref.mDesc.GetElementSpaceSize() + 63) / 64 * 64;
|
||||
|
||||
result = bnorm_fwd<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
|
||||
time_kernel,
|
||||
updateMovingAverage,
|
||||
saveMeanAndInvVariance,
|
||||
{0, 1, 2},
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
bnBias_dev.GetDeviceBuffer(),
|
||||
y_dev.GetDeviceBuffer(),
|
||||
averageFactor,
|
||||
updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
|
||||
updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr,
|
||||
epsilon,
|
||||
saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
|
||||
p_tmp_mean,
|
||||
p_tmp_meansquare);
|
||||
|
||||
if(result < 0)
|
||||
return (false);
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto batchNormFwd_ref = ReferenceBatchNormFwdInstance<InOutDataType, AccDataType>{};
|
||||
|
||||
auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
bnBias.mData.data(),
|
||||
y_ref.mData.data(),
|
||||
0.1, // exponentialAverageFactor
|
||||
updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, // resultRunningMean
|
||||
updateMovingAverage ? resultRunningVariance_ref.mData.data()
|
||||
: nullptr, // resultRunningVariance
|
||||
epsilon,
|
||||
saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr);
|
||||
|
||||
if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout
|
||||
<< "The runtime parameters seems not supported by the BatchNorm instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
Tensor<AccDataType> resultRunningMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
|
||||
resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
|
||||
|
||||
pass =
|
||||
pass && ck::utils::check_err(resultRunningMean.mData, resultRunningMean_ref.mData);
|
||||
pass = pass && ck::utils::check_err(resultRunningVariance.mData,
|
||||
resultRunningVariance_ref.mData);
|
||||
};
|
||||
|
||||
if(saveMeanAndInvVariance)
|
||||
{
|
||||
Tensor<AccDataType> resultSaveMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
|
||||
resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
|
||||
|
||||
pass = pass && ck::utils::check_err(resultSaveMean.mData, resultSaveMean_ref.mData);
|
||||
pass = pass && ck::utils::check_err(resultSaveInvVariance.mData,
|
||||
resultSaveInvVariance_ref.mData);
|
||||
};
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
static const double averageFactor = 0.1;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormFwdArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<float, float>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<int8_t, float>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<double, double>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 16, 1024},
|
||||
true,
|
||||
false,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
example/34_batchnorm/batchnorm_infer_impl.hpp (new file, 119 lines)
@@ -0,0 +1,119 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "batchnorm_common.hpp"
|
||||
|
||||
template <typename InOutDataType,
|
||||
typename AccDataType,
|
||||
ck::index_t Rank,
|
||||
ck::index_t NumBatchNormReduceDim,
|
||||
bool fastest_dim_is_reduced = false>
|
||||
int bnorm_infer(
|
||||
bool time_kernel,
|
||||
const std::array<int, NumBatchNormReduceDim> reduceDims,
|
||||
const std::array<ck::index_t, Rank> xyLengths,
|
||||
const std::array<ck::index_t, Rank> xStrides,
|
||||
const std::array<ck::index_t, Rank> yStrides,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
|
||||
const void* p_x,
|
||||
const void* p_scale,
|
||||
const void* p_bias,
|
||||
double epsilon,
|
||||
const void* p_estimatedMean,
|
||||
const void* p_estimatedVariance,
|
||||
void* p_y)
|
||||
{
|
||||
(void)bnScaleBiasMeanVarLengths;
|
||||
|
||||
static_assert(NumBatchNormReduceDim < Rank,
|
||||
"Invalid number of reduced dimensions for batchnorm!");
|
||||
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
|
||||
// variance,
|
||||
// scale,
|
||||
// bias,
|
||||
ck::Tuple<InOutDataType>, // y
|
||||
NormalizeInInfer,
|
||||
Rank,
|
||||
2, // MPerThread
|
||||
ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
|
||||
ck::Sequence<1>>; // scalarPerVector: y
|
||||
|
||||
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
|
||||
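// As in the forward path, per-channel tensors get stride 0 on the reduced dimensions so that the
// elementwise kernel broadcasts them across those dimensions.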
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
int32_t reduceLength = 1;
|
||||
|
||||
for(auto dim : reduceDims)
|
||||
reduceLength *= xyLengths[dim];
|
||||
|
||||
int32_t invariantLength = 1;
|
||||
|
||||
for(auto dim : invariantDims)
|
||||
invariantLength *= xyLengths[dim];
|
||||
|
||||
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
|
||||
|
||||
float avg_time = 0.0f;
|
||||
std::size_t num_bytes = 0;
|
||||
|
||||
auto dev_normalize = DeviceNormalizeInstance{};
|
||||
|
||||
auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
|
||||
xyLengths,
|
||||
{xStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{yStrides},
|
||||
{p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
|
||||
{p_y},
|
||||
NormalizeInInfer{epsilon});
|
||||
|
||||
if(!dev_normalize.IsSupportedArgument(argument_ptr1.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
|
||||
<< std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr1 = dev_normalize.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
|
||||
total_length * sizeof(InOutDataType));
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
};
|
||||
|
||||
return (0);
|
||||
};
|
||||
example/34_batchnorm/batchnorm_infer_nhwc.cpp (new file, 346 lines)
@@ -0,0 +1,346 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp"
|
||||
|
||||
#include "batchnorm_infer_impl.hpp"
|
||||
|
||||
template <typename InOutDataType, typename AccDataType>
|
||||
using ReferenceBatchNormInferInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<InOutDataType,
|
||||
AccDataType>;
|
||||
|
||||
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class BatchNormInferArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
|
||||
"lengths, must have 4 integers for nhwc"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
|
||||
"result by "
|
||||
"comparing with the host-based batch-normalization"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer "
|
||||
"value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 3 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename InOutDataType, typename AccDataType>
|
||||
bool bnorm_infer_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> y_ref(inOutLengths);
|
||||
Tensor<InOutDataType> y(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> estimatedMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> estimatedVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
{
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 2.5f;
|
||||
const float noise_stddev = 0.0001f;
|
||||
|
||||
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
estimatedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.0001f;
|
||||
|
||||
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the savedMean to be values with tiny variation to the mean of the x values
|
||||
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
// initialize the variance to be values with tiny variation to the variance of the x values
|
||||
estimatedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// estimated mean provided by the user application
|
||||
DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize());
|
||||
// estimated variance provided by the user application
|
||||
DeviceMem estimatedVariance_dev(sizeof(AccDataType) *
|
||||
estimatedVariance.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
bnBias_dev.ToDevice(bnBias.mData.data());
|
||||
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
|
||||
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
|
||||
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
|
||||
std::copy(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
i_scaleBiasMeanVarLengths.begin());
|
||||
std::copy(scaleBiasMeanVarStrides.begin(),
|
||||
scaleBiasMeanVarStrides.end(),
|
||||
i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
int result = 0;
|
||||
|
||||
result = bnorm_infer<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
|
||||
time_kernel,
|
||||
{0, 1, 2},
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
bnBias_dev.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
estimatedMean_dev.GetDeviceBuffer(),
|
||||
estimatedVariance_dev.GetDeviceBuffer(),
|
||||
y_dev.GetDeviceBuffer());
|
||||
|
||||
if(result < 0)
|
||||
return (false);
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto batchNormInfer_ref = ReferenceBatchNormInferInstance<InOutDataType, AccDataType>{};
|
||||
|
||||
auto argument_ptr_ref =
|
||||
batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
bnBias.mData.data(),
|
||||
epsilon,
|
||||
estimatedMean.mData.data(),
|
||||
estimatedVariance.mData.data(),
|
||||
y_ref.mData.data());
|
||||
|
||||
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout
|
||||
<< "The runtime parameters seems not supported by the BatchNorm instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
static const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormInferArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::half_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<float, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<int8_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::bhalf_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<double, double>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::half_t, float>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 16, 1024},
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
@@ -47,3 +47,6 @@ add_subdirectory(29_batched_gemm_bias_e_permute)
|
||||
add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
|
||||
add_subdirectory(31_batched_gemm_gemm)
|
||||
add_subdirectory(32_batched_gemm_softmax_gemm)
|
||||
add_subdirectory(33_multiple_reduce)
|
||||
add_subdirectory(34_batchnorm)
|
||||
|
||||
|
||||
@@ -1,353 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename DDataType,
|
||||
typename EDataType,
|
||||
typename FDataType,
|
||||
typename ComputeDataType,
|
||||
typename ElementwiseFunctor,
|
||||
index_t NDim,
|
||||
index_t MPerThread,
|
||||
index_t AScalarPerVector,
|
||||
index_t BScalarPerVector,
|
||||
index_t CScalarPerVector,
|
||||
index_t DScalarPerVector,
|
||||
index_t EScalarPerVector,
|
||||
index_t FScalarPerVector>
|
||||
struct Device5AryElementwise : public DeviceElementwise<5, 1, NDim, ElementwiseFunctor>
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
template <typename Desc_M>
|
||||
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
|
||||
{
|
||||
const auto m = desc_m.GetLength(I0);
|
||||
const index_t loop_step = gridSize * blockSize * MPerThread;
|
||||
const auto pad = math::integer_least_multiple(m, loop_step) - m;
|
||||
const auto desc_m_pad =
|
||||
transform_tensor_descriptor(desc_m,
|
||||
make_tuple(make_right_pad_transform(m, pad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return desc_m_pad;
|
||||
}
|
||||
|
||||
static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
|
||||
const std::vector<index_t>& stride,
|
||||
index_t gridSize,
|
||||
index_t blockSize)
|
||||
{
|
||||
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
|
||||
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NDim>{});
|
||||
|
||||
// nd desc - [s0, s1, s2, ...]
|
||||
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
|
||||
|
||||
// merge nd to 1d desc - [s0 * s1 * ...]
|
||||
if constexpr(NDim > 1)
|
||||
{
|
||||
const auto desc_m = transform_tensor_descriptor(
|
||||
desc,
|
||||
make_tuple(make_merge_transform(tupleOfShape)),
|
||||
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
|
||||
}
|
||||
else
|
||||
return PadDescriptor_M_1d(desc, gridSize, blockSize);
|
||||
}
|
||||
|
||||
using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using DGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using EGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using FGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
|
||||
using Gridwise5AryEltwise = Gridwise5AryElementwise_1D<ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
DDataType,
|
||||
EDataType,
|
||||
FDataType,
|
||||
ComputeDataType,
|
||||
AGridDesc_M,
|
||||
BGridDesc_M,
|
||||
CGridDesc_M,
|
||||
DGridDesc_M,
|
||||
EGridDesc_M,
|
||||
FGridDesc_M,
|
||||
ElementwiseFunctor,
|
||||
MPerThread,
|
||||
AScalarPerVector,
|
||||
BScalarPerVector,
|
||||
CScalarPerVector,
|
||||
DScalarPerVector,
|
||||
EScalarPerVector,
|
||||
FScalarPerVector>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const ADataType* p_a,
|
||||
const BDataType* p_b,
|
||||
const CDataType* p_c,
|
||||
const DDataType* p_d,
|
||||
const EDataType* p_e,
|
||||
FDataType* p_f,
|
||||
const std::vector<index_t>& lengths,
|
||||
const std::vector<index_t>& a_strides,
|
||||
const std::vector<index_t>& b_strides,
|
||||
const std::vector<index_t>& c_strides,
|
||||
const std::vector<index_t>& d_strides,
|
||||
const std::vector<index_t>& e_strides,
|
||||
const std::vector<index_t>& f_strides,
|
||||
ElementwiseFunctor functor)
|
||||
: p_a_(p_a),
|
||||
p_b_(p_b),
|
||||
p_c_(p_c),
|
||||
p_d_(p_d),
|
||||
p_e_(p_e),
|
||||
p_f_(p_f),
|
||||
lengths_(lengths),
|
||||
a_strides_(a_strides),
|
||||
b_strides_(b_strides),
|
||||
c_strides_(c_strides),
|
||||
d_strides_(d_strides),
|
||||
e_strides_(e_strides),
|
||||
f_strides_(f_strides),
|
||||
functor_(functor),
|
||||
blockSize_(256),
|
||||
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
|
||||
{
|
||||
a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
|
||||
b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
|
||||
c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
|
||||
d_grid_desc_m_ = MakeDescriptor_M(lengths, d_strides, gridSize_, blockSize_);
|
||||
e_grid_desc_m_ = MakeDescriptor_M(lengths, e_strides, gridSize_, blockSize_);
|
||||
f_grid_desc_m_ = MakeDescriptor_M(lengths, f_strides, gridSize_, blockSize_);
|
||||
}
|
||||
|
||||
const ADataType* p_a_;
|
||||
const BDataType* p_b_;
|
||||
const CDataType* p_c_;
|
||||
const DDataType* p_d_;
|
||||
const EDataType* p_e_;
|
||||
FDataType* p_f_;
|
||||
std::vector<index_t> lengths_;
|
||||
AGridDesc_M a_grid_desc_m_;
|
||||
BGridDesc_M b_grid_desc_m_;
|
||||
CGridDesc_M c_grid_desc_m_;
|
||||
DGridDesc_M d_grid_desc_m_;
|
||||
EGridDesc_M e_grid_desc_m_;
|
||||
FGridDesc_M f_grid_desc_m_;
|
||||
std::vector<index_t> a_strides_;
|
||||
std::vector<index_t> b_strides_;
|
||||
std::vector<index_t> c_strides_;
|
||||
std::vector<index_t> d_strides_;
|
||||
std::vector<index_t> e_strides_;
|
||||
std::vector<index_t> f_strides_;
|
||||
ElementwiseFunctor functor_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto kernel = kernel_5ary_elementwise_1d<Gridwise5AryEltwise,
|
||||
ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
DDataType,
|
||||
EDataType,
|
||||
FDataType,
|
||||
AGridDesc_M,
|
||||
BGridDesc_M,
|
||||
CGridDesc_M,
|
||||
DGridDesc_M,
|
||||
EGridDesc_M,
|
||||
FGridDesc_M,
|
||||
ElementwiseFunctor>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.p_a_,
|
||||
arg.p_b_,
|
||||
arg.p_c_,
|
||||
arg.p_d_,
|
||||
arg.p_e_,
|
||||
arg.p_f_,
|
||||
arg.a_grid_desc_m_,
|
||||
arg.b_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.d_grid_desc_m_,
|
||||
arg.e_grid_desc_m_,
|
||||
arg.f_grid_desc_m_,
|
||||
arg.functor_);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument& p_arg) { return IsSupportedArgument(&p_arg); }
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg == nullptr)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.size() != NDim)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.back() % MPerThread != 0)
|
||||
return false;
|
||||
|
||||
auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
|
||||
bool ret = true;
|
||||
|
||||
if(!isLastDimensionCoalesced)
|
||||
ret = scalarPerVector == 1;
|
||||
else
|
||||
ret = MPerThread % scalarPerVector == 0;
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->d_strides_.back() == 1, DScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->e_strides_.back() == 1, EScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->f_strides_.back() == 1, FScalarPerVector))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
static auto MakeArgument(std::array<const void*, 5> p_inputs,
|
||||
std::array<void*, 1> p_outputs,
|
||||
std::vector<index_t> lengths,
|
||||
std::vector<index_t> a_strides,
|
||||
std::vector<index_t> b_strides,
|
||||
std::vector<index_t> c_strides,
|
||||
std::vector<index_t> d_strides,
|
||||
std::vector<index_t> e_strides,
|
||||
std::vector<index_t> f_strides,
|
||||
ElementwiseFunctor functor)
|
||||
{
|
||||
return Argument{static_cast<const ADataType*>(p_inputs[0]),
|
||||
static_cast<const BDataType*>(p_inputs[1]),
|
||||
static_cast<const CDataType*>(p_inputs[2]),
|
||||
static_cast<const DDataType*>(p_inputs[3]),
|
||||
static_cast<const EDataType*>(p_inputs[4]),
|
||||
static_cast<FDataType*>(p_outputs[0]),
|
||||
lengths,
|
||||
a_strides,
|
||||
b_strides,
|
||||
c_strides,
|
||||
d_strides,
|
||||
e_strides,
|
||||
f_strides,
|
||||
functor};
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(std::array<const void*, 5> p_inputs,
|
||||
std::array<void*, 1> p_outputs,
|
||||
std::vector<index_t> lengths,
|
||||
std::vector<std::vector<index_t>> input_strides,
|
||||
std::vector<std::vector<index_t>> output_strides,
|
||||
ElementwiseFunctor functor) override
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
|
||||
static_cast<const BDataType*>(p_inputs[1]),
|
||||
static_cast<const CDataType*>(p_inputs[2]),
|
||||
static_cast<const DDataType*>(p_inputs[3]),
|
||||
static_cast<const EDataType*>(p_inputs[4]),
|
||||
static_cast<FDataType*>(p_outputs[0]),
|
||||
lengths,
|
||||
input_strides[0],
|
||||
input_strides[1],
|
||||
input_strides[2],
|
||||
input_strides[3],
|
||||
input_strides[4],
|
||||
output_strides[0],
|
||||
functor);
|
||||
}
|
||||
|
||||
static auto MakeInvoker() { return Invoker{}; }
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "Device5aryElementwise"
|
||||
<< "<"
|
||||
<< "NDim = " << NDim
|
||||
<< "MPerThread = " << MPerThread
|
||||
<< "AScalarPerVector = " << AScalarPerVector
|
||||
<< "BScalarPerVector = " << BScalarPerVector
|
||||
<< "CScalarPerVector = " << CScalarPerVector
|
||||
<< "DScalarPerVector = " << DScalarPerVector
|
||||
<< "EScalarPerVector = " << EScalarPerVector
|
||||
<< "FScalarPerVector = " << FScalarPerVector
|
||||
<< ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
}; // namespace device

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,44 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t Rank, index_t NumBatchNormReduceDim>
struct DeviceBatchNormFwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
const std::array<index_t, Rank> xyLengths,
const std::array<index_t, Rank> xStrides,
const std::array<index_t, Rank> yStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
void* p_y,
double exponentialAverageFactor,
void* resultRunningMean,
void* resultRunningVariance,
double epsilon,
void* resultSaveMean,
void* resultSaveInvVariance) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <index_t Rank, index_t NumBatchNormReduceDim>
using DeviceBatchNormFwdPtr = std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
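For reference, a minimal host-side sketch of how a concrete implementation of this interface would be driven. The instance name device_batchnorm_fwd, the NHWC problem sizes and the device pointers are assumptions for illustration; only the argument order of MakeArgumentPointer and the invoker call come from the interface above.

// Hedged sketch: Rank = 4, NumBatchNormReduceDim = 3 (reduce over N, H, W of an NHWC tensor).
// device_batchnorm_fwd is assumed to be a concrete DeviceBatchNormFwd<4, 3> instance and the
// p_* names are assumed to be valid device buffers of matching data types.
std::array<ck::index_t, 4> xyLengths = {N, H, W, C};
std::array<ck::index_t, 4> xyStrides = {H * W * C, W * C, C, 1};
std::array<ck::index_t, 1> scaleBiasMeanVarLengths = {C};
std::array<ck::index_t, 1> scaleBiasMeanVarStrides = {1};

auto argument_ptr = device_batchnorm_fwd.MakeArgumentPointer(xyLengths,
                                                             xyStrides, // xStrides
                                                             xyStrides, // yStrides
                                                             scaleBiasMeanVarLengths,
                                                             scaleBiasMeanVarStrides,
                                                             p_x,
                                                             p_scale,
                                                             p_bias,
                                                             p_y,
                                                             0.1,  // exponentialAverageFactor
                                                             p_runningMean,
                                                             p_runningVariance,
                                                             1e-5, // epsilon
                                                             p_saveMean,
                                                             p_saveInvVariance);

if(device_batchnorm_fwd.IsSupportedArgument(argument_ptr.get()))
    device_batchnorm_fwd.MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, true});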
@@ -0,0 +1,41 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t Rank, index_t NumBatchNormReduceDim>
struct DeviceBatchNormInfer : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
const std::array<index_t, Rank> xyLengths,
const std::array<index_t, Rank> xStrides,
const std::array<index_t, Rank> yStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
double epsilon,
const void* estimatedMean,
const void* estimatedInvVariance,
void* p_y) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <index_t Rank, index_t NumBatchNormReduceDim>
using DeviceBatchNormInferPtr = std::unique_ptr<DeviceBatchNormInfer<Rank, NumBatchNormReduceDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
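A corresponding sketch for the inference interface, reusing the shapes from the forward sketch above; here the estimated statistics are read-only inputs and epsilon precedes them in the argument list. The instance and buffer names are again assumptions.

// Hedged sketch for a concrete DeviceBatchNormInfer<4, 3>; device_batchnorm_infer and the
// p_* buffers are assumed to exist.
auto argument_ptr = device_batchnorm_infer.MakeArgumentPointer(xyLengths,
                                                               xyStrides,
                                                               xyStrides,
                                                               scaleBiasMeanVarLengths,
                                                               scaleBiasMeanVarStrides,
                                                               p_x,
                                                               p_scale,
                                                               p_bias,
                                                               1e-5, // epsilon
                                                               p_estimatedMean,
                                                               p_estimatedInvVariance,
                                                               p_y);

if(device_batchnorm_infer.IsSupportedArgument(argument_ptr.get()))
    device_batchnorm_infer.MakeInvokerPointer()->Run(argument_ptr.get());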
@@ -1,247 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename ComputeDataType,
|
||||
typename ElementwiseFunctor,
|
||||
index_t NDim,
|
||||
index_t MPerThread,
|
||||
index_t AScalarPerVector,
|
||||
index_t BScalarPerVector,
|
||||
index_t CScalarPerVector>
|
||||
struct DeviceBinaryElementwise : public DeviceElementwise<2, 1, NDim, ElementwiseFunctor>
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
template <typename Desc_M>
|
||||
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
|
||||
{
|
||||
const auto M = desc_m.GetLength(I0);
|
||||
const index_t loop_step = gridSize * blockSize * MPerThread;
|
||||
const auto pad = math::integer_least_multiple(M, loop_step) - M;
|
||||
const auto desc_m_pad =
|
||||
transform_tensor_descriptor(desc_m,
|
||||
make_tuple(make_right_pad_transform(M, pad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return desc_m_pad;
|
||||
}
|
||||
|
||||
static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
|
||||
const std::vector<index_t>& strides,
|
||||
index_t gridSize,
|
||||
index_t blockSize)
|
||||
{
|
||||
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
|
||||
auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<NDim>{});
|
||||
|
||||
// nd desc - [s0, s1, s2, ...]
|
||||
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
|
||||
|
||||
// merge nd to 1d desc - [s0 * s1 * ...]
|
||||
if constexpr(NDim > 1)
|
||||
{
|
||||
const auto desc_m = transform_tensor_descriptor(
|
||||
desc,
|
||||
make_tuple(make_merge_transform(tupleOfShape)),
|
||||
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
|
||||
}
|
||||
else
|
||||
return PadDescriptor_M_1d(desc, gridSize, blockSize);
|
||||
}
|
||||
|
||||
using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
|
||||
using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
ComputeDataType,
|
||||
AGridDesc_M,
|
||||
BGridDesc_M,
|
||||
CGridDesc_M,
|
||||
ElementwiseFunctor,
|
||||
MPerThread,
|
||||
AScalarPerVector,
|
||||
BScalarPerVector,
|
||||
CScalarPerVector>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const ADataType* p_a,
|
||||
const BDataType* p_b,
|
||||
CDataType* p_c,
|
||||
const std::vector<index_t>& lengths,
|
||||
const std::vector<index_t>& a_strides,
|
||||
const std::vector<index_t>& b_strides,
|
||||
const std::vector<index_t>& c_strides,
|
||||
ElementwiseFunctor functor)
|
||||
: p_a_(p_a),
|
||||
p_b_(p_b),
|
||||
p_c_(p_c),
|
||||
lengths_(lengths),
|
||||
a_strides_(a_strides),
|
||||
b_strides_(b_strides),
|
||||
c_strides_(c_strides),
|
||||
functor_(functor),
|
||||
blockSize_(256),
|
||||
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
|
||||
{
|
||||
a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
|
||||
b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
|
||||
c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
|
||||
}
|
||||
|
||||
const ADataType* p_a_;
|
||||
const BDataType* p_b_;
|
||||
CDataType* p_c_;
|
||||
std::vector<int> lengths_;
|
||||
AGridDesc_M a_grid_desc_m_;
|
||||
BGridDesc_M b_grid_desc_m_;
|
||||
CGridDesc_M c_grid_desc_m_;
|
||||
std::vector<index_t> a_strides_;
|
||||
std::vector<index_t> b_strides_;
|
||||
std::vector<index_t> c_strides_;
|
||||
ElementwiseFunctor functor_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto kernel = kernel_binary_elementwise_1d<GridwiseBinEltwise,
|
||||
ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
AGridDesc_M,
|
||||
BGridDesc_M,
|
||||
CGridDesc_M,
|
||||
ElementwiseFunctor>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.p_a_,
|
||||
arg.p_b_,
|
||||
arg.p_c_,
|
||||
arg.a_grid_desc_m_,
|
||||
arg.b_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.functor_);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg == nullptr)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.size() != NDim)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.back() % MPerThread != 0)
|
||||
return false;
|
||||
|
||||
auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
|
||||
bool ret = true;
|
||||
|
||||
if(!isLastDimensionCoalesced)
|
||||
ret = scalarPerVector == 1;
|
||||
else
|
||||
ret = MPerThread % scalarPerVector == 0;
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
|
||||
return false;
|
||||
|
||||
if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(std::array<const void*, 2> p_inputs,
|
||||
std::array<void*, 1> p_outputs,
|
||||
std::vector<index_t> lengths,
|
||||
std::vector<std::vector<index_t>> input_strides,
|
||||
std::vector<std::vector<index_t>> output_strides,
|
||||
ElementwiseFunctor functor) override
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
|
||||
static_cast<const BDataType*>(p_inputs[1]),
|
||||
static_cast<CDataType*>(p_outputs[0]),
|
||||
lengths,
|
||||
input_strides[0],
|
||||
input_strides[1],
|
||||
output_strides[0],
|
||||
functor);
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceBinaryElementwise"
|
||||
<< "<"
|
||||
<< "NDim = " << NDim
|
||||
<< "MPerThread = " << MPerThread
|
||||
<< "AScalarPerVector = " << AScalarPerVector
|
||||
<< "BScalarPerVector = " << BScalarPerVector
|
||||
<< "CScalarPerVector = " << CScalarPerVector
|
||||
<< ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -14,7 +14,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_cgemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
@@ -538,48 +538,43 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
|
||||
|
||||
float ave_time = 0;
|
||||
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
using Subtract = ck::tensor_operation::element_wise::Subtract;
|
||||
using GridwiseBinAdd = GridwiseBinaryElementwise_1D<CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
Add,
|
||||
MPerThread,
|
||||
AScalarPerVector,
|
||||
BScalarPerVector,
|
||||
CScalarPerVector>;
|
||||
using GridwiseBinSubtract = GridwiseBinaryElementwise_1D<CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
Subtract,
|
||||
MPerThread,
|
||||
AScalarPerVector,
|
||||
BScalarPerVector,
|
||||
CScalarPerVector>;
|
||||
const auto add_kernel = kernel_binary_elementwise_1d<GridwiseBinAdd,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
Add>;
|
||||
const auto subtract_kernel = kernel_binary_elementwise_1d<GridwiseBinSubtract,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CDataType,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
CGridDesc_M,
|
||||
Subtract>;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
using Subtract = ck::tensor_operation::element_wise::Subtract;
|
||||
|
||||
using GridwiseBinAdd =
|
||||
GridwiseElementwise_1D<Tuple<CGridDesc_M, CGridDesc_M>,
|
||||
Tuple<CGridDesc_M>,
|
||||
Tuple<const CDataType*, const CDataType*>,
|
||||
Tuple<CDataType*>,
|
||||
Add,
|
||||
MPerThread,
|
||||
Sequence<AScalarPerVector, BScalarPerVector>,
|
||||
Sequence<CScalarPerVector>>;
|
||||
|
||||
using GridwiseBinSubtract =
|
||||
GridwiseElementwise_1D<Tuple<CGridDesc_M, CGridDesc_M>,
|
||||
Tuple<CGridDesc_M>,
|
||||
Tuple<const CDataType*, const CDataType*>,
|
||||
Tuple<CDataType*>,
|
||||
Subtract,
|
||||
MPerThread,
|
||||
Sequence<AScalarPerVector, BScalarPerVector>,
|
||||
Sequence<CScalarPerVector>>;
|
||||
|
||||
const auto add_kernel = kernel_elementwise_1d<GridwiseBinAdd,
|
||||
Tuple<CGridDesc_M, CGridDesc_M>,
|
||||
Tuple<CGridDesc_M>,
|
||||
Tuple<const CDataType*, const CDataType*>,
|
||||
Tuple<CDataType*>,
|
||||
Add>;
|
||||
|
||||
const auto subtract_kernel =
|
||||
kernel_elementwise_1d<GridwiseBinSubtract,
|
||||
Tuple<CGridDesc_M, CGridDesc_M>,
|
||||
Tuple<CGridDesc_M>,
|
||||
Tuple<const CDataType*, const CDataType*>,
|
||||
Tuple<CDataType*>,
|
||||
Subtract>;
|
||||
|
||||
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
|
||||
{
|
||||
@@ -631,18 +626,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
|
||||
arg.block_2_ctile_map_);
|
||||
|
||||
// c_real = aux - aux_2
|
||||
ave_time += launch_and_time_kernel(stream_config,
|
||||
subtract_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_aux_grid_,
|
||||
arg.p_aux_2_grid_,
|
||||
arg.p_c_grid_real_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
Subtract{});
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
subtract_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
|
||||
make_tuple(arg.c_grid_desc_m_),
|
||||
make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
|
||||
const_cast<const CDataType*>(arg.p_aux_2_grid_)),
|
||||
make_tuple(arg.p_c_grid_real_),
|
||||
Subtract{});
|
||||
|
||||
ave_time +=
|
||||
launch_and_time_kernel(stream_config,
|
||||
@@ -679,18 +674,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
|
||||
arg.block_2_ctile_map_);
|
||||
|
||||
// c_imag = aux + aux_2
|
||||
ave_time += launch_and_time_kernel(stream_config,
|
||||
add_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_aux_grid_,
|
||||
arg.p_aux_2_grid_,
|
||||
arg.p_c_grid_imag_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
Add{});
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
add_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
|
||||
make_tuple(arg.c_grid_desc_m_),
|
||||
make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
|
||||
const_cast<const CDataType*>(arg.p_aux_2_grid_)),
|
||||
make_tuple(arg.p_c_grid_imag_),
|
||||
Add{});
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -742,18 +737,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
|
||||
arg.block_2_ctile_map_);
|
||||
|
||||
// c_real = aux - aux_2
|
||||
ave_time += launch_and_time_kernel(stream_config,
|
||||
subtract_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_aux_grid_,
|
||||
arg.p_aux_2_grid_,
|
||||
arg.p_c_grid_real_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
Subtract{});
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
subtract_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
|
||||
make_tuple(arg.c_grid_desc_m_),
|
||||
make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
|
||||
const_cast<const CDataType*>(arg.p_aux_2_grid_)),
|
||||
make_tuple(arg.p_c_grid_real_),
|
||||
Subtract{});
|
||||
|
||||
ave_time +=
|
||||
launch_and_time_kernel(stream_config,
|
||||
@@ -790,18 +785,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
|
||||
arg.block_2_ctile_map_);
|
||||
|
||||
// c_imag = aux + aux_2
|
||||
ave_time += launch_and_time_kernel(stream_config,
|
||||
add_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_aux_grid_,
|
||||
arg.p_aux_2_grid_,
|
||||
arg.p_c_grid_imag_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
arg.c_grid_desc_m_,
|
||||
Add{});
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
add_kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_),
|
||||
make_tuple(arg.c_grid_desc_m_),
|
||||
make_tuple(const_cast<const CDataType*>(arg.p_aux_grid_),
|
||||
const_cast<const CDataType*>(arg.p_aux_2_grid_)),
|
||||
make_tuple(arg.p_c_grid_imag_),
|
||||
Add{});
|
||||
}
|
||||
|
||||
return ave_time;
|
||||
|
||||
@@ -13,7 +13,6 @@
#include "ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

@@ -13,7 +13,6 @@
#include "ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

@@ -2,38 +2,286 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "device_base.hpp"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <ck::index_t NumInputTensor,
|
||||
ck::index_t NumOutputTensor,
|
||||
index_t NDim,
|
||||
typename ElementwiseFunctor>
|
||||
struct DeviceElementwise : public BaseOperator
|
||||
template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t NumDim,
|
||||
index_t MPerThread,
|
||||
typename InScalarPerVectorSeq,
|
||||
typename OutScalarPerVectorSeq>
|
||||
struct DeviceElementwise
|
||||
: public DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
|
||||
{
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(std::array<const void*, NumInputTensor> p_inputs,
|
||||
std::array<void*, NumOutputTensor> p_outputs,
|
||||
std::vector<index_t> lengths,
|
||||
std::vector<std::vector<index_t>> input_strides,
|
||||
std::vector<std::vector<index_t>> output_strides,
|
||||
ElementwiseFunctor functor) = 0;
|
||||
static constexpr int NumInput = InDataTypeTuple::Size();
|
||||
static constexpr int NumOutput = OutDataTypeTuple::Size();
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
|
||||
NumOutput == OutScalarPerVectorSeq::Size(),
|
||||
"Tuple size is inconsistent with the number of in/out!");
|
||||
|
||||
template <ck::index_t NumInputTensor,
|
||||
ck::index_t NumOutputTensor,
|
||||
index_t NDim,
|
||||
typename ElementwiseFunctor>
|
||||
using DeviceElementwisePtr =
|
||||
std::unique_ptr<DeviceElementwise<NumInputTensor, NumOutputTensor, NDim, ElementwiseFunctor>>;
|
||||
static auto GenerateInDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<const DataType*>(nullptr);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
};
|
||||
|
||||
static auto GenerateOutDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<DataType*>(nullptr);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
};
|
||||
|
||||
using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple());
|
||||
using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
|
||||
|
||||
template <typename Desc_M>
|
||||
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
|
||||
const auto m = desc_m.GetLength(I0);
|
||||
const index_t loop_step = gridSize * blockSize * MPerThread;
|
||||
const auto pad = math::integer_least_multiple(m, loop_step) - m;
|
||||
const auto desc_m_pad =
|
||||
transform_tensor_descriptor(desc_m,
|
||||
make_tuple(make_right_pad_transform(m, pad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return desc_m_pad;
|
||||
}
|
||||
|
||||
static auto MakeDescriptor_M(const std::array<index_t, NumDim>& lengths,
|
||||
const std::array<index_t, NumDim>& stride,
|
||||
index_t gridSize,
|
||||
index_t blockSize)
|
||||
{
|
||||
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
|
||||
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});
|
||||
|
||||
// nd desc - [s0, s1, s2, ...]
|
||||
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
|
||||
|
||||
// merge nd to 1d desc - [s0 * s1 * ...]
|
||||
if constexpr(NumDim > 1)
|
||||
{
|
||||
const auto desc_m = transform_tensor_descriptor(
|
||||
desc,
|
||||
make_tuple(make_merge_transform(tupleOfShape)),
|
||||
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim>{})),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
|
||||
}
|
||||
else
|
||||
return PadDescriptor_M_1d(desc, gridSize, blockSize);
|
||||
}
|
||||
|
||||
template <index_t TupleSize>
|
||||
static auto GenerateInOutGrid1dDescTuple(Number<TupleSize>)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto) {
|
||||
if constexpr(NumDim > 1)
|
||||
{
|
||||
return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
return MakeDescriptor_M({1}, {1}, 1, 1);
|
||||
};
|
||||
},
|
||||
Number<TupleSize>{});
|
||||
};
|
||||
|
||||
using InGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number<NumInput>{}));
|
||||
using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number<NumOutput>{}));
|
||||
|
||||
using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple,
|
||||
OutGrid1dDescTuple,
|
||||
InDataTypePointerTuple,
|
||||
OutDataTypePointerTuple,
|
||||
ElementwiseOperation,
|
||||
MPerThread,
|
||||
InScalarPerVectorSeq,
|
||||
OutScalarPerVectorSeq>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::array<index_t, NumDim> lengths,
|
||||
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
|
||||
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
|
||||
const std::array<const void*, NumInput> in_dev_buffers,
|
||||
const std::array<void*, NumOutput> out_dev_buffers,
|
||||
ElementwiseOperation elementwise_op)
|
||||
|
||||
: lengths_(lengths),
|
||||
inStridesArray_(inStridesArray),
|
||||
outStridesArray_(outStridesArray),
|
||||
elementwise_op_(elementwise_op),
|
||||
blockSize_(256),
|
||||
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
|
||||
{
|
||||
in_dev_buffers_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
|
||||
return static_cast<const DataType*>(in_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
out_dev_buffers_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
return static_cast<DataType*>(out_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
in_grid_1d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
lengths, inStridesArray[I.value], gridSize_, blockSize_);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
out_grid_1d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
lengths, outStridesArray[I.value], gridSize_, blockSize_);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
}
|
||||
|
||||
InDataTypePointerTuple in_dev_buffers_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
InGrid1dDescTuple in_grid_1d_desc_tuple_;
|
||||
OutGrid1dDescTuple out_grid_1d_desc_tuple_;
|
||||
|
||||
std::array<index_t, NumDim> lengths_;
|
||||
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
|
||||
std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
|
||||
|
||||
ElementwiseOperation elementwise_op_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
|
||||
InGrid1dDescTuple,
|
||||
OutGrid1dDescTuple,
|
||||
InDataTypePointerTuple,
|
||||
OutDataTypePointerTuple,
|
||||
ElementwiseOperation>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.in_grid_1d_desc_tuple_,
|
||||
arg.out_grid_1d_desc_tuple_,
|
||||
arg.in_dev_buffers_,
|
||||
arg.out_dev_buffers_,
|
||||
arg.elementwise_op_);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg == nullptr)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.back() % MPerThread != 0)
|
||||
return false;
|
||||
|
||||
auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
|
||||
const std::array<index_t, NumDim>& strides,
|
||||
index_t scalarPerVector) {
|
||||
if(strides.back() == 1 && lengths.back() % scalarPerVector == 0)
|
||||
return true;
|
||||
|
||||
if(strides.back() != 1 && scalarPerVector == 1)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
bool valid = true;
|
||||
static_for<0, NumInput, 1>{}([&](auto I) {
|
||||
if(!IsScalarPerVectorValid(
|
||||
pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
|
||||
valid = false;
|
||||
});
|
||||
|
||||
static_for<0, NumOutput, 1>{}([&](auto I) {
|
||||
if(!IsScalarPerVectorValid(
|
||||
pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
|
||||
valid = false;
|
||||
});
|
||||
|
||||
return valid;
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
|
||||
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
|
||||
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
|
||||
const std::array<const void*, NumInput> in_dev_buffers,
|
||||
const std::array<void*, NumOutput> out_dev_buffers,
|
||||
ElementwiseOperation elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(lengths,
|
||||
inStridesArray,
|
||||
outStridesArray,
|
||||
in_dev_buffers,
|
||||
out_dev_buffers,
|
||||
elementwise_op);
|
||||
}
|
||||
|
||||
static auto MakeInvoker() { return Invoker{}; }
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
}; // namespace device
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
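The rewritten struct above replaces the old pointer-first, vector-of-vector interface with fixed-size std::arrays for lengths and strides followed by the device buffers. A minimal sketch of the new calling convention, assuming device_instance is a two-input, one-output DeviceElementwise specialization (for example an element-wise Add over an M x N row-major problem) and that M, N, p_a, p_b, p_c are defined elsewhere:

// Hedged sketch of the array-based MakeArgumentPointer; everything except the argument
// order of the override above is an assumption.
std::array<ck::index_t, 2> lengths = {M, N};
std::array<std::array<ck::index_t, 2>, 2> input_strides  = {{{N, 1}, {N, 1}}};
std::array<std::array<ck::index_t, 2>, 1> output_strides = {{{N, 1}}};

auto argument_ptr = device_instance.MakeArgumentPointer(lengths,
                                                        input_strides,
                                                        output_strides,
                                                        {p_a, p_b},
                                                        {p_c},
                                                        ck::tensor_operation::element_wise::Add{});
auto invoker_ptr = device_instance.MakeInvokerPointer();

if(device_instance.IsSupportedArgument(argument_ptr.get()))
    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});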
@@ -0,0 +1,45 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <memory>
#include <array>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim>
struct DeviceElementwiseBase : public BaseOperator
{
static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size();

virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
const std::array<const void*, NumInput> in_dev_buffers,
const std::array<void*, NumOutput> out_dev_buffers,
ElementwiseOperation elementwise_op) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim>
using DeviceElementwiseBasePtr = std::unique_ptr<
DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,58 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <memory>
#include <array>
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/reduction_enums.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t Rank,
index_t NumReduceDim,
index_t NumReduction,
typename InElementwiseOperationTuple,
typename AccElementwiseOperationTuple>
struct DeviceMultipleReduce : public BaseOperator
{
static constexpr index_t NumInputDim = Rank;
static constexpr index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1;

virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
const std::array<index_t, NumInputDim> inLengths,
const std::array<index_t, NumInputDim> inStrides,
const std::array<index_t, NumOutputDim> outLengths,
const std::array<std::array<index_t, NumOutputDim>, NumReduction> outStrides,
const std::array<int, NumReduceDim> reduceDims,
const std::array<const void*, NumReduction> alphas,
const std::array<const void*, NumReduction> betas,
const void* in_dev,
const std::array<void*, NumReduction> out_dev_buffers,
const InElementwiseOperationTuple in_elementwise_op_tuple,
const AccElementwiseOperationTuple acc_elementwise_op_tuple) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <index_t Rank,
index_t NumReduceDim,
index_t NumReduction,
typename InElementwiseOperationTuple,
typename AccElementwiseOperationTuple>
using DeviceMultipleReducePtr = std::unique_ptr<DeviceMultipleReduce<Rank,
NumReduceDim,
NumReduction,
InElementwiseOperationTuple,
AccElementwiseOperationTuple>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
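This interface exposes several reductions of the same input computed in one pass, e.g. the per-channel mean and mean-of-square that a batchnorm forward pass needs. A hedged host-side sketch follows; multi_reduce_op is assumed to be a concrete DeviceMultipleReduce<4, 3, 2, ...> instance, and the functor tuples and device buffers are assumed to exist and to match its template types.

// Sketch only: two reductions over N, H, W of an NHWC input; names other than the
// MakeArgumentPointer parameters are assumptions.
std::array<ck::index_t, 4> inLengths  = {N, H, W, C};
std::array<ck::index_t, 4> inStrides  = {H * W * C, W * C, C, 1};
std::array<ck::index_t, 1> outLengths = {C};
std::array<std::array<ck::index_t, 1>, 2> outStrides = {{{1}, {1}}};
std::array<int, 3> reduceDims = {0, 1, 2};

float alpha = 1.0f, beta = 0.0f; // assuming AccDataType == float

auto argument_ptr = multi_reduce_op.MakeArgumentPointer(inLengths,
                                                        inStrides,
                                                        outLengths,
                                                        outStrides,
                                                        reduceDims,
                                                        {&alpha, &alpha}, // one alpha per reduction
                                                        {&beta, &beta},   // one beta per reduction
                                                        p_x,
                                                        {p_mean, p_meansquare},
                                                        in_op_tuple,
                                                        acc_op_tuple);

if(multi_reduce_op.IsSupportedArgument(argument_ptr.get()))
    multi_reduce_op.MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, true});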
@@ -0,0 +1,595 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataTypeTuple,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple,
|
||||
InMemoryDataOperationEnum OutMemoryDataOperation,
|
||||
bool PropagateNan,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
typename OutDstVectorSizeSeq>
|
||||
struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce<Rank,
|
||||
NumReduceDim,
|
||||
NumReduction,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
||||
"Invalid thread cluster size assignments!");
|
||||
|
||||
static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
|
||||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(NumReduction == OutDataTypeTuple::Size() &&
|
||||
NumReduction == InElementwiseOperationTuple::Size() &&
|
||||
NumReduction == AccElementwiseOperationTuple::Size() &&
|
||||
NumReduction == OutDstVectorSizeSeq::Size(),
|
||||
"All tuple should have the same size as the number of Reductions!");
|
||||
|
||||
static_assert(sequence_all_of(OutDstVectorSizeSeq{},
|
||||
[](auto vectorSize) {
|
||||
return (MThreadSliceSize % vectorSize == 0);
|
||||
}),
|
||||
"The OutDstVectorSize should completely divide the MThreadSliceSize!");
|
||||
|
||||
static constexpr bool CheckDataTypeTuple()
|
||||
{
|
||||
bool flag = true;
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto I) {
|
||||
using OutDataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
flag =
|
||||
flag && ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
|
||||
OutDataType>::value;
|
||||
});
|
||||
|
||||
return flag;
|
||||
};
|
||||
|
||||
static_assert(CheckDataTypeTuple(),
|
||||
"The OutDataType must support the specified OutMemoryDataOperation!");
|
||||
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
|
||||
static constexpr index_t NumInputDim = Rank;
|
||||
static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
|
||||
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
|
||||
|
||||
// So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added
|
||||
// later
|
||||
static constexpr bool use_multiblock =
|
||||
(OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
|
||||
|
||||
static_assert(
|
||||
ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation),
|
||||
"The reduction accumulation operation must be compatible with the OutMemoryDataOperation!");
|
||||
|
||||
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
static auto GenerateOutDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<DataType*>(nullptr);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
};
|
||||
|
||||
using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::array<index_t, NumInputDim>& inLengths,
|
||||
const std::array<index_t, NumInputDim>& inStrides,
|
||||
int blkGroupSize,
|
||||
int numBlockTileIteration)
|
||||
{
|
||||
const auto tupleSrcLengths =
|
||||
generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInputDim>{});
|
||||
const auto tupleSrcStrides =
|
||||
generate_tuple([&](auto I) { return inStrides[I]; }, Number<NumInputDim>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDim)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
const auto reduceDimLengths = generate_tuple(
|
||||
[&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
|
||||
const auto invariantDimLengths =
|
||||
generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
|
||||
const auto inPad_M =
|
||||
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
|
||||
const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
|
||||
|
||||
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
|
||||
in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
|
||||
make_right_pad_transform(reduceLength, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeDst1dDescriptor(const std::array<index_t, NumOutputDim>& outLengths,
|
||||
const std::array<index_t, NumOutputDim>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths =
|
||||
generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumOutputDim>{});
|
||||
const auto tupleDstStrides =
|
||||
generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumOutputDim>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto outPad =
|
||||
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
|
||||
|
||||
auto out_grid_desc_m_padded = transform_tensor_descriptor(
|
||||
out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(invariantLength, outPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
static auto GenerateOutGrid1dDescTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
(void)I;
|
||||
return MakeDst1dDescriptor(std::array<index_t, NumOutputDim>{},
|
||||
std::array<index_t, NumOutputDim>{});
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
};
|
||||
|
||||
using InGridDesc_M_K = decltype(MakeSrc2dDescriptor(
|
||||
std::array<index_t, NumInputDim>{}, std::array<index_t, NumInputDim>{}, 1, 1));
|
||||
using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple());
|
||||
|
||||
static auto MakeDst1dDescriptorForBufferSet(const std::array<index_t, NumOutputDim>& outLengths,
|
||||
const std::array<index_t, NumOutputDim>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths =
|
||||
generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumOutputDim>{});
|
||||
const auto tupleDstStrides =
|
||||
generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumOutputDim>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto length = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto pad = math::integer_least_multiple(length, BlockSize) - length;
|
||||
|
||||
auto out_grid_desc_m_padded =
|
||||
transform_tensor_descriptor(out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(length, pad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
static auto GenerateOutGrid1dDescTuple_2()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
(void)I;
|
||||
return MakeDst1dDescriptorForBufferSet(std::array<index_t, NumOutputDim>{},
|
||||
std::array<index_t, NumOutputDim>{});
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
};
|
||||
|
||||
using OutGridDesc_M_Tuple_2 = decltype(GenerateOutGrid1dDescTuple_2());
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::array<index_t, NumInputDim>& inLengths,
|
||||
const std::array<index_t, NumInputDim>& inStrides,
|
||||
const std::array<index_t, NumOutputDim>& outLengths,
|
||||
const std::array<std::array<index_t, NumOutputDim>, NumReduction>& outStridesArray,
|
||||
const std::array<int, NumReduceDim>& reduceDims,
|
||||
const std::array<const void*, NumReduction>& alphas,
|
||||
const std::array<const void*, NumReduction>& betas,
|
||||
const void* in_dev,
|
||||
const std::array<void*, NumReduction>& out_dev_buffers,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple)
|
||||
: outLengths_{outLengths},
|
||||
outStridesArray_{outStridesArray},
|
||||
in_elementwise_op_tuple_{in_elementwise_op_tuple},
|
||||
acc_elementwise_op_tuple_{acc_elementwise_op_tuple}
|
||||
{
|
||||
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
|
||||
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
|
||||
|
||||
for(size_t i = 0; i < NumReduction; i++)
|
||||
{
|
||||
alpha_values_(i) = *static_cast<const AccDataType*>(alphas[i]);
|
||||
beta_values_(i) = *static_cast<const AccDataType*>(betas[i]);
|
||||
};
|
||||
|
||||
in_dev_ = static_cast<const InDataType*>(in_dev);
|
||||
|
||||
out_dev_buffers_ = generate_tuple(
|
||||
[&](auto iR) {
|
||||
using OutDataTypePointer =
|
||||
remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
|
||||
using OutDataType = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;
|
||||
return static_cast<OutDataType*>(out_dev_buffers[iR]);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
|
||||
|
||||
if constexpr(use_multiblock)
|
||||
{
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
{
|
||||
int testBlkGroupSize =
|
||||
(reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
// we want the blkGroupSize be not more than 128
|
||||
if(testBlkGroupSize <= 128)
|
||||
break;
|
||||
|
||||
iterations++;
|
||||
};
|
||||
|
||||
blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
numBlockTileIteration = iterations;
|
||||
}
|
||||
else
|
||||
{
|
||||
blkGroupSize = 1;
|
||||
numBlockTileIteration =
|
||||
(reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
|
||||
};
|
||||
|
||||
in_grid_desc_m_k =
|
||||
MakeSrc2dDescriptor(inLengths_, inStrides_, blkGroupSize, numBlockTileIteration);
|
||||
|
||||
out_grid_desc_m_tuple = generate_tuple(
|
||||
[&](auto I) { return MakeDst1dDescriptor(outLengths, outStridesArray[I]); },
|
||||
Number<NumReduction>{});
|
||||
|
||||
out_grid_desc_m_tuple_2 = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDst1dDescriptorForBufferSet(outLengths, outStridesArray[I]);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize * blkGroupSize;
|
||||
|
||||
gridSize_pre =
|
||||
math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize;
|
||||
}
|
||||
|
||||
std::array<index_t, NumInputDim> inLengths_;
|
||||
std::array<index_t, NumInputDim> inStrides_;
|
||||
|
||||
std::array<index_t, NumOutputDim> outLengths_;
|
||||
std::array<std::array<index_t, NumOutputDim>, NumReduction> outStridesArray_;
|
||||
|
||||
Array<AccDataType, NumReduction> alpha_values_;
|
||||
Array<AccDataType, NumReduction> beta_values_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
|
||||
InGridDesc_M_K in_grid_desc_m_k;
|
||||
OutGridDesc_M_Tuple out_grid_desc_m_tuple;
|
||||
OutGridDesc_M_Tuple_2 out_grid_desc_m_tuple_2;
|
||||
|
||||
InElementwiseOperationTuple in_elementwise_op_tuple_;
|
||||
AccElementwiseOperationTuple acc_elementwise_op_tuple_;
|
||||
|
||||
long_index_t invariant_total_length;
|
||||
long_index_t reduce_total_length;
|
||||
|
||||
int blkGroupSize;
|
||||
int numBlockTileIteration;
|
||||
size_t gridSize;
|
||||
|
||||
size_t gridSize_pre;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
using GridwiseMultipleReduce =
|
||||
GridwiseMultipleReduction_mk_to_m_multiblock<NumReduction,
|
||||
InDataType,
|
||||
OutDataTypePointerTuple,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M_Tuple,
|
||||
ReduceOperation,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple,
|
||||
OutMemoryDataOperation,
|
||||
PropagateNan,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSizeSeq>;
|
||||
|
||||
const auto kernel_main =
|
||||
kernel_multiple_reduce_multiblock<GridwiseMultipleReduce,
|
||||
NumReduction,
|
||||
InDataType,
|
||||
OutDataTypePointerTuple,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M_Tuple,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
if constexpr(use_multiblock)
|
||||
{
|
||||
auto identity_values = generate_tuple(
|
||||
[&](auto iR) {
|
||||
using OutDataType = remove_cvref_t<decltype(OutDataTypeTuple{}[iR])>;
|
||||
return ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
|
||||
OutMemoryDataOperation);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
const auto kernel_pre = kernel_multiple_buffer_set_value<OutGridDesc_M_Tuple_2,
|
||||
NumReduction,
|
||||
BlockSize,
|
||||
OutDataTypePointerTuple,
|
||||
OutDataTypeTuple>;
|
||||
|
||||
avg_time += launch_and_time_kernel(stream_config,
|
||||
kernel_pre,
|
||||
dim3(arg.gridSize_pre),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.out_grid_desc_m_tuple_2,
|
||||
arg.out_dev_buffers_,
|
||||
identity_values);
|
||||
};
|
||||
|
||||
avg_time += launch_and_time_kernel(stream_config,
|
||||
kernel_main,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.in_grid_desc_m_k,
|
||||
arg.out_grid_desc_m_tuple,
|
||||
arg.in_elementwise_op_tuple_,
|
||||
arg.acc_elementwise_op_tuple_,
|
||||
arg.blkGroupSize,
|
||||
arg.numBlockTileIteration,
|
||||
arg.alpha_values_,
|
||||
arg.in_dev_,
|
||||
arg.beta_values_,
|
||||
arg.out_dev_buffers_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(use_multiblock)
|
||||
{
|
||||
for(size_t i = 0; i < pArg->beta_values_.Size(); i++)
|
||||
if(pArg->beta_values_[i] != 0.0f)
|
||||
return (false);
|
||||
};
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(NumInvariantDim == 0)
|
||||
{
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
// To improve
|
||||
bool valid = true;
|
||||
static_for<0, NumReduction, 1>{}([&](auto I) {
|
||||
if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 &&
|
||||
OutDstVectorSizeSeq::At(I) != 1)
|
||||
valid = false;
|
||||
|
||||
if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0)
|
||||
valid = false;
|
||||
});
|
||||
|
||||
if(!valid)
|
||||
return (false);
|
||||
|
||||
if constexpr(use_multiblock)
|
||||
{
|
||||
// blkGroupSize of 1 should be handled by Blockwise path using
|
||||
// InMemoryDataOperationEnum::Set
|
||||
if(pArg->blkGroupSize == 1)
|
||||
return (false);
|
||||
|
||||
// This is very strong restriction, but needed to avoid some failure
|
||||
if(pArg->outLengths_[NumOutputDim - 1] % M_BlockTileSize != 0)
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
// cases with very small reduce_total_length should be handled by ThreadWise kernel
|
||||
if(pArg->reduce_total_length / KThreadSliceSize < 2)
|
||||
return (false);
|
||||
};
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument> MakeArgumentPointer(
|
||||
const std::array<index_t, NumInputDim> inLengths,
|
||||
const std::array<index_t, NumInputDim> inStrides,
|
||||
const std::array<index_t, NumOutputDim> outLengths,
|
||||
const std::array<std::array<index_t, NumOutputDim>, NumReduction> outStridesArray,
|
||||
const std::array<int, NumReduceDim> reduceDims,
|
||||
const std::array<const void*, NumReduction> alphas,
|
||||
const std::array<const void*, NumReduction> betas,
|
||||
const void* in_dev,
|
||||
const std::array<void*, NumReduction> out_dev_buffers,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStridesArray,
|
||||
reduceDims,
|
||||
alphas,
|
||||
betas,
|
||||
in_dev,
|
||||
out_dev_buffers,
|
||||
in_elementwise_op_tuple,
|
||||
acc_elementwise_op_tuple);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceMultipleReduceBlockWise<" : "DeviceMultipleReduceMultiBlock<") << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ",";
|
||||
str << "OutDstVectorSize";
|
||||
static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); });
|
||||
str << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,422 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataTypeTuple,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple,
|
||||
bool PropagateNan,
|
||||
index_t BlockSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
typename OutDstVectorSizeSeq>
|
||||
struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce<Rank,
|
||||
NumReduceDim,
|
||||
NumReduction,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
|
||||
static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
|
||||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(NumReduction == OutDataTypeTuple::Size() &&
|
||||
NumReduction == InElementwiseOperationTuple::Size() &&
|
||||
NumReduction == AccElementwiseOperationTuple::Size() &&
|
||||
NumReduction == OutDstVectorSizeSeq::Size(),
|
||||
"All tuple should have the same size as the number of Reductions!");
|
||||
|
||||
static_assert(sequence_all_of(OutDstVectorSizeSeq{},
|
||||
[](auto vectorSize) {
|
||||
return (MThreadSliceSize % vectorSize == 0);
|
||||
}),
|
||||
"The OutDstVectorSize should completely divide the MThreadSliceSize!");
|
||||
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
|
||||
static constexpr index_t NumInputDim = Rank;
|
||||
static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
|
||||
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
|
||||
|
||||
static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize;
|
||||
static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize;
|
||||
|
||||
static auto GenerateOutDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<DataType*>(nullptr);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
};
|
||||
|
||||
using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::array<index_t, NumInputDim>& inLengths,
|
||||
const std::array<index_t, NumInputDim>& inStrides)
|
||||
{
|
||||
const auto tupleSrcLengths =
|
||||
generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInputDim>{});
|
||||
const auto tupleSrcStrides =
|
||||
generate_tuple([&](auto I) { return inStrides[I]; }, Number<NumInputDim>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDim)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
const auto reduceDimLengths = generate_tuple(
|
||||
[&](auto I) { return inLengths[NumInvariantDim + I]; }, Number<NumReduceDim>{});
|
||||
const auto invariantDimLengths =
|
||||
generate_tuple([&](auto I) { return inLengths[I]; }, Number<NumInvariantDim>{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const auto inPad_M =
|
||||
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
|
||||
const auto inPad_K =
|
||||
math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
|
||||
|
||||
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
|
||||
in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
|
||||
make_right_pad_transform(reduceLength, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
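    // Illustration (not from the commit): for a rank-4 input whose dimensions were already
    // shuffled so the reduce dims sit at the tail, e.g. lengths {C, N, H, W} with
    // NumInvariantDim = 1, the transform above merges {C} into the M axis and {N, H, W}
    // into the K axis, then right-pads both axes up to multiples of M_BlockTileSize and
    // K_BlockTileSize so whole tiles can be handed to the threads.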
|
||||
static auto MakeDst1dDescriptor(const std::array<index_t, NumOutputDim>& outLengths,
|
||||
const std::array<index_t, NumOutputDim>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths =
|
||||
generate_tuple([&](auto I) { return outLengths[I]; }, Number<NumOutputDim>{});
|
||||
const auto tupleDstStrides =
|
||||
generate_tuple([&](auto I) { return outStrides[I]; }, Number<NumOutputDim>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto outPad =
|
||||
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
|
||||
|
||||
auto out_grid_desc_m_padded = transform_tensor_descriptor(
|
||||
out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(invariantLength, outPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
static auto GenerateOutGrid1dDescTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
(void)I;
|
||||
return MakeDst1dDescriptor(std::array<index_t, NumOutputDim>{},
|
||||
std::array<index_t, NumOutputDim>{});
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
};
|
||||
|
||||
using InGridDesc_M_K = decltype(MakeSrc2dDescriptor(std::array<index_t, NumInputDim>{},
|
||||
std::array<index_t, NumInputDim>{}));
|
||||
using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple());
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::array<index_t, NumInputDim>& inLengths,
|
||||
const std::array<index_t, NumInputDim>& inStrides,
|
||||
const std::array<index_t, NumOutputDim>& outLengths,
|
||||
const std::array<std::array<index_t, NumOutputDim>, NumReduction>& outStridesArray,
|
||||
const std::array<int, NumReduceDim>& reduceDims,
|
||||
const std::array<const void*, NumReduction>& alphas,
|
||||
const std::array<const void*, NumReduction>& betas,
|
||||
const void* in_dev,
|
||||
const std::array<void*, NumReduction>& out_dev_buffers,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple)
|
||||
: outLengths_{outLengths},
|
||||
outStridesArray_{outStridesArray},
|
||||
in_elementwise_op_tuple_{in_elementwise_op_tuple},
|
||||
acc_elementwise_op_tuple_{acc_elementwise_op_tuple}
|
||||
{
|
||||
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
|
||||
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
|
||||
|
||||
for(size_t i = 0; i < NumReduction; i++)
|
||||
{
|
||||
alpha_values_(i) = *static_cast<const AccDataType*>(alphas[i]);
|
||||
beta_values_(i) = *static_cast<const AccDataType*>(betas[i]);
|
||||
};
|
||||
|
||||
in_dev_ = static_cast<const InDataType*>(in_dev);
|
||||
|
||||
out_dev_buffers_ = generate_tuple(
|
||||
[&](auto iR) {
|
||||
using OutDataTypePointer =
|
||||
remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
|
||||
using OutDataType = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;
|
||||
return static_cast<OutDataType*>(out_dev_buffers[iR]);
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
|
||||
|
||||
in_grid_desc_m_k = MakeSrc2dDescriptor(inLengths_, inStrides_);
|
||||
|
||||
out_grid_desc_m_tuple = generate_tuple(
|
||||
[&](auto I) { return MakeDst1dDescriptor(outLengths, outStridesArray[I]); },
|
||||
Number<NumReduction>{});
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
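            // Illustrative numbers (not from the commit): with BlockSize = 256 and
            // MThreadSliceSize = 1, M_BlockTileSize is 256, so an invariant_total_length
            // of 1000 is padded to 1024 and gridSize becomes 1024 / 256 = 4 blocks.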
}
|
||||
|
||||
std::array<index_t, NumInputDim> inLengths_;
|
||||
std::array<index_t, NumInputDim> inStrides_;
|
||||
|
||||
std::array<index_t, NumOutputDim> outLengths_;
|
||||
std::array<std::array<index_t, NumOutputDim>, NumReduction> outStridesArray_;
|
||||
|
||||
Array<AccDataType, NumReduction> alpha_values_;
|
||||
Array<AccDataType, NumReduction> beta_values_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
|
||||
InGridDesc_M_K in_grid_desc_m_k;
|
||||
OutGridDesc_M_Tuple out_grid_desc_m_tuple;
|
||||
|
||||
InElementwiseOperationTuple in_elementwise_op_tuple_;
|
||||
AccElementwiseOperationTuple acc_elementwise_op_tuple_;
|
||||
|
||||
long_index_t invariant_total_length;
|
||||
long_index_t reduce_total_length;
|
||||
|
||||
size_t gridSize;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
using GridwiseMultipleReduce =
|
||||
GridwiseMultipleReduction_mk_to_m_threadwise<NumReduction,
|
||||
InDataType,
|
||||
OutDataTypePointerTuple,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M_Tuple,
|
||||
ReduceOperation,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
PropagateNan,
|
||||
BlockSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSizeSeq>;
|
||||
|
||||
const auto kernel_main =
|
||||
kernel_multiple_reduce_threadwise<GridwiseMultipleReduce,
|
||||
NumReduction,
|
||||
InDataType,
|
||||
OutDataTypePointerTuple,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M_Tuple,
|
||||
InElementwiseOperationTuple,
|
||||
AccElementwiseOperationTuple>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
avg_time += launch_and_time_kernel(stream_config,
|
||||
kernel_main,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.in_grid_desc_m_k,
|
||||
arg.out_grid_desc_m_tuple,
|
||||
arg.in_elementwise_op_tuple_,
|
||||
arg.acc_elementwise_op_tuple_,
|
||||
arg.alpha_values_,
|
||||
arg.in_dev_,
|
||||
arg.beta_values_,
|
||||
arg.out_dev_buffers_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(NumInvariantDim == 0)
|
||||
{
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
|
||||
// To improve
|
||||
bool valid = true;
|
||||
static_for<0, NumReduction, 1>{}([&](auto I) {
|
||||
if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 &&
|
||||
OutDstVectorSizeSeq::At(I) != 1)
|
||||
valid = false;
|
||||
|
||||
if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0)
|
||||
valid = false;
|
||||
});
|
||||
|
||||
if(!valid)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument> MakeArgumentPointer(
|
||||
const std::array<index_t, NumInputDim> inLengths,
|
||||
const std::array<index_t, NumInputDim> inStrides,
|
||||
const std::array<index_t, NumOutputDim> outLengths,
|
||||
const std::array<std::array<index_t, NumOutputDim>, NumReduction> outStridesArray,
|
||||
const std::array<int, NumReduceDim> reduceDims,
|
||||
const std::array<const void*, NumReduction> alphas,
|
||||
const std::array<const void*, NumReduction> betas,
|
||||
const void* in_dev,
|
||||
const std::array<void*, NumReduction> out_dev_buffers,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStridesArray,
|
||||
reduceDims,
|
||||
alphas,
|
||||
betas,
|
||||
in_dev,
|
||||
out_dev_buffers,
|
||||
in_elementwise_op_tuple,
|
||||
acc_elementwise_op_tuple);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceMultipleReduceThreadwise<" << BlockSize << ",";
|
||||
str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << 1 << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ",";
|
||||
str << "OutDstVectorSize";
|
||||
static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); });
|
||||
str << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
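A hypothetical instantiation, only to make the template-parameter order above concrete; the data types, tile sizes, the ck::reduce::Add operation and the PassThrough element-wise ops are illustrative assumptions, not a configuration added by this commit.

    // Sketch: two reductions over the trailing dimension of a rank-2 float tensor,
    // both using Add and identity element-wise ops (in the batchnorm use case the
    // second input op would square its argument instead).
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceTwoReductions = ck::tensor_operation::device::DeviceMultipleReduceThreadWise<
        2,                                   // NumReduction
        float,                               // InDataType
        float,                               // AccDataType
        ck::Tuple<float, float>,             // OutDataTypeTuple
        2,                                   // Rank
        1,                                   // NumReduceDim
        ck::reduce::Add,                     // ReduceOperation
        ck::Tuple<PassThrough, PassThrough>, // InElementwiseOperationTuple
        ck::Tuple<PassThrough, PassThrough>, // AccElementwiseOperationTuple
        false,                               // PropagateNan
        256,                                 // BlockSize
        1,                                   // MThreadSliceSize
        1,                                   // KThreadSliceSize
        1,                                   // InSrcVectorDim
        1,                                   // InSrcVectorSize
        ck::Sequence<1, 1>>;                 // OutDstVectorSizeSeq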
@@ -35,6 +35,25 @@ std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>&
    return std::make_pair(invariant_total_length, reduce_total_length);
};

template <index_t Rank, int NumReduceDim>
std::pair<long_index_t, long_index_t> get_2d_lengths(const std::array<index_t, Rank>& inLengths)
{
    static_assert(Rank <= 6, "bigger Rank size not supported!");

    long_index_t invariant_total_length = 1;
    long_index_t reduce_total_length    = 1;

    constexpr int NumInvariantDim = Rank - NumReduceDim;

    for(int i = NumInvariantDim; i < Rank; i++)
        reduce_total_length *= inLengths[i];

    for(int i = 0; i < NumInvariantDim; i++)
        invariant_total_length *= inLengths[i];

    return std::make_pair(invariant_total_length, reduce_total_length);
};

// helper functions using variadic template arguments
template <index_t... Ns>
auto make_tuple_from_array_and_index_seq(const std::vector<index_t>& lengths, Sequence<Ns...>)
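A small worked example of the new std::array overload (not part of the commit): for Rank = 4 and NumReduceDim = 2, an already-shuffled inLengths of {2, 3, 4, 5} keeps the two leading dims invariant and reduces over the two trailing ones.

    // invariant_total_length = 2 * 3 = 6, reduce_total_length = 4 * 5 = 20
    auto lengths_2d = ck::tensor_operation::device::get_2d_lengths<4, 2>(
        std::array<ck::index_t, 4>{2, 3, 4, 5}); // -> pair{6, 20}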
@@ -85,6 +104,39 @@ std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origL
    return newLengthsStrides;
};

template <index_t Rank, index_t NumReduceDim>
std::array<index_t, Rank>
shuffle_tensor_dimensions(const std::array<index_t, Rank>& origLengthsStrides,
                          const std::array<int, NumReduceDim>& reduceDims)
{
    std::array<index_t, Rank> newLengthsStrides;

    int reduceFlag = 0;

    // flag the bits for the reduceDims
    for(int i = 0; i < NumReduceDim; i++)
    {
        reduceFlag |= 1 << reduceDims[i];
    };

    // collect invariant dimensions
    int pos = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
            newLengthsStrides[pos++] = origLengthsStrides[i];
        };

    // collect reduce dimensions
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) > 0)
        {
            newLengthsStrides[pos++] = origLengthsStrides[i];
        };

    return newLengthsStrides;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck

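Similarly, a worked example of the array version of shuffle_tensor_dimensions (not part of the commit): the invariant dimensions are collected first, in their original order, followed by the reduce dimensions, so the reduced dims always end up at the tail.

    // Rank = 4, reduceDims = {0, 2}: {N, C, H, W} -> {C, W, N, H}
    std::array<ck::index_t, 4> lengths{16, 8, 32, 64};
    std::array<int, 2> reduce_dims{0, 2};
    auto shuffled =
        ck::tensor_operation::device::shuffle_tensor_dimensions<4, 2>(lengths, reduce_dims);
    // shuffled == {8, 64, 16, 32}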
@@ -1,183 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename ElementwiseFunctor,
|
||||
index_t Dim,
|
||||
index_t ScalarPerVector>
|
||||
struct DeviceUnaryElementwise : public BaseOperator
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
template <typename Desc_M0>
|
||||
static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
|
||||
{
|
||||
const auto m0 = desc_m0.GetLength(I0);
|
||||
const index_t loop_step = gridSize * blockSize * ScalarPerVector;
|
||||
const auto pad = math::integer_least_multiple(m0, loop_step) - m0;
|
||||
const auto desc_m0_pad =
|
||||
transform_tensor_descriptor(desc_m0,
|
||||
make_tuple(make_right_pad_transform(m0, pad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return desc_m0_pad;
|
||||
}
|
||||
|
||||
static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
|
||||
const std::vector<index_t>& stride,
|
||||
index_t gridSize,
|
||||
index_t blockSize)
|
||||
{
|
||||
auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
|
||||
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
|
||||
|
||||
// nd desc - [s0, s1, s2, ...]
|
||||
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
|
||||
|
||||
// merge nd to 1d desc - [s0 * s1 * ...]
|
||||
if constexpr(Dim > 1)
|
||||
{
|
||||
const auto desc_m0 = transform_tensor_descriptor(
|
||||
desc,
|
||||
make_tuple(make_merge_transform(tupleOfShape)),
|
||||
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
|
||||
}
|
||||
else
|
||||
return PadDescriptor_M0_1d(desc, gridSize, blockSize);
|
||||
}
|
||||
|
||||
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
|
||||
using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
|
||||
BDataType,
|
||||
GridDesc_M0,
|
||||
ElementwiseFunctor,
|
||||
ScalarPerVector>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const ADataType* p_a,
|
||||
BDataType* p_b,
|
||||
const std::vector<index_t>& shape,
|
||||
const std::vector<index_t>& stride_a,
|
||||
const std::vector<index_t>& stride_b,
|
||||
ElementwiseFunctor functor)
|
||||
: p_a_(p_a),
|
||||
p_b_(p_b),
|
||||
shape_(shape),
|
||||
functor_(functor),
|
||||
blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
|
||||
{
|
||||
index_t tensor_size =
|
||||
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
|
||||
gridSize_ = GridwiseUEltwise::CalculateGridSize(tensor_size);
|
||||
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
|
||||
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
|
||||
}
|
||||
|
||||
const ADataType* p_a_;
|
||||
BDataType* p_b_;
|
||||
std::vector<int> shape_;
|
||||
GridDesc_M0 a_grid_desc_m0_;
|
||||
GridDesc_M0 b_grid_desc_m0_;
|
||||
ElementwiseFunctor functor_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
|
||||
ADataType,
|
||||
BDataType,
|
||||
GridDesc_M0,
|
||||
ElementwiseFunctor>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.p_a_,
|
||||
arg.p_b_,
|
||||
arg.a_grid_desc_m0_,
|
||||
arg.b_grid_desc_m0_,
|
||||
arg.functor_);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg == nullptr)
|
||||
return false;
|
||||
|
||||
if(pArg->shape_.back() % ScalarPerVector != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
|
||||
void* p_b,
|
||||
std::vector<index_t> shape,
|
||||
std::vector<index_t> stride_a,
|
||||
std::vector<index_t> stride_b,
|
||||
ElementwiseFunctor functor)
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
|
||||
static_cast<BDataType*>(p_b),
|
||||
shape,
|
||||
stride_a,
|
||||
stride_b,
|
||||
functor);
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceBinaryElementwise"
|
||||
<< "<"
|
||||
<< "ScalarPerVector = " << ScalarPerVector
|
||||
<< ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -198,17 +198,44 @@ struct Normalize
    // FIXME: is double absolutely necessary?
    Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {}

    template <typename T>
    __host__ __device__ constexpr void operator()(
        T& y, const T& x, const T& mean, const T& mean_square, const T& gamma, const T& beta) const;
    template <typename T1, typename T2, typename T3>
    __host__ __device__ constexpr void operator()(T1& y,
                                                  const T1& x,
                                                  const T2& mean,
                                                  const T2& mean_square,
                                                  const T3& gamma,
                                                  const T3& beta) const;

    template <>
    __host__ __device__ constexpr void operator()<float>(float& y,
                                                          const float& x,
                                                          const float& mean,
                                                          const float& mean_square,
                                                          const float& gamma,
                                                          const float& beta) const
    __host__ __device__ constexpr void operator()<half_t, float, half_t>(half_t& y,
                                                                          const half_t& x,
                                                                          const float& mean,
                                                                          const float& mean_square,
                                                                          const half_t& gamma,
                                                                          const half_t& beta) const
    {
        using ck::math::sqrt;

        float variance = mean_square - (mean * mean);

        float tmp_x     = type_convert<float>(x);
        float tmp_gamma = type_convert<float>(gamma);
        float tmp_beta  = type_convert<float>(beta);

        float tmp_y =
            ((tmp_x - mean) / sqrt(variance + type_convert<float>(epsilon_))) * tmp_gamma +
            tmp_beta;

        y = type_convert<half_t>(tmp_y);
    };

    template <>
    __host__ __device__ constexpr void operator()<float, float, float>(float& y,
                                                                        const float& x,
                                                                        const float& mean,
                                                                        const float& mean_square,
                                                                        const float& gamma,
                                                                        const float& beta) const
    {
        using ck::math::sqrt;

@@ -217,12 +244,12 @@ struct Normalize
    };

    template <>
    __host__ __device__ constexpr void operator()<double>(double& y,
                                                           const double& x,
                                                           const double& mean,
                                                           const double& mean_square,
                                                           const double& gamma,
                                                           const double& beta) const
    __host__ __device__ constexpr void operator()<double, double, double>(double& y,
                                                                           const double& x,
                                                                           const double& mean,
                                                                           const double& mean_square,
                                                                           const double& gamma,
                                                                           const double& beta) const
    {
        using ck::math::sqrt;

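The float and double bodies are cut off by the hunk boundaries above; assuming they follow the same formula as the half_t specialization, a host-side call looks roughly like the sketch below (values are illustrative, not from the commit).

    // variance = mean_square - mean * mean = 5 - 4 = 1, so
    // y = (x - mean) / sqrt(variance + eps) * gamma + beta = (3 - 2) / ~1 * 2 + 0.5 ~= 2.5
    ck::tensor_operation::element_wise::Normalize normalize{1e-4};
    float y = 0.f;
    normalize(y, 3.f, 2.f, 5.f, 2.f, 0.5f); // y ~= 2.4999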
@@ -0,0 +1,321 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/reduction_common.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/utility/reduction_functions_accumulate.hpp"
|
||||
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename GridwiseMultipleReduction,
|
||||
index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename AccDataType,
|
||||
typename InGridDesc_M_K,
|
||||
typename OutGridDesc_M_Tuple,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple>
|
||||
__global__ void
|
||||
kernel_multiple_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k,
|
||||
const OutGridDesc_M_Tuple out_grid_desc_m_tuple,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple,
|
||||
index_t block_group_size,
|
||||
index_t num_k_block_tile_iteration,
|
||||
Array<AccDataType, NumReduction> alpha_values,
|
||||
const InDataType* const __restrict__ p_in_value_global,
|
||||
Array<AccDataType, NumReduction> beta_values,
|
||||
OutDataTypePointerTuple p_out_value_global_tuple)
|
||||
{
|
||||
GridwiseMultipleReduction::Run(in_grid_desc_m_k,
|
||||
out_grid_desc_m_tuple,
|
||||
in_elementwise_op_tuple,
|
||||
acc_elementwise_op_tuple,
|
||||
block_group_size,
|
||||
num_k_block_tile_iteration,
|
||||
alpha_values,
|
||||
p_in_value_global,
|
||||
beta_values,
|
||||
p_out_value_global_tuple);
|
||||
};
|
||||
|
||||
template <index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename AccDataType,
|
||||
typename InGridDesc_M_K,
|
||||
typename OutGridDesc_M_Tuple,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple,
|
||||
InMemoryDataOperationEnum OutMemoryDataOperation,
|
||||
bool PropagateNan,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
typename OutDstVectorSizeSeq>
|
||||
struct GridwiseMultipleReduction_mk_to_m_multiblock
|
||||
{
|
||||
static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
|
||||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(NumReduction == OutDataTypePointerTuple::Size() &&
|
||||
NumReduction == OutGridDesc_M_Tuple::Size() &&
|
||||
NumReduction == OutDstVectorSizeSeq::Size() &&
|
||||
NumReduction == InElementwiseOperationTuple::Size() &&
|
||||
NumReduction == AccElementwiseOperationTuple::Size(),
|
||||
"All tuple should have the same size as the number of Reductions!");
|
||||
|
||||
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
|
||||
|
||||
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
|
||||
|
||||
using ThreadBufferDimAccessOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
using ThreadClusterArrangeOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
static constexpr auto thread_cluster_desc =
|
||||
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
|
||||
|
||||
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
|
||||
|
||||
using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
|
||||
BlockSize,
|
||||
ThreadClusterLengths_M_K,
|
||||
ThreadClusterArrangeOrder,
|
||||
ReduceOperation,
|
||||
PropagateNan>;
|
||||
|
||||
using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
|
||||
ThreadReduceSrcDesc_M_K,
|
||||
ThreadReduceDstDesc_M,
|
||||
ReduceOperation,
|
||||
PropagateNan>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
|
||||
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
|
||||
|
||||
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
|
||||
const OutGridDesc_M_Tuple& out_grid_desc_m_tuple,
|
||||
const InElementwiseOperationTuple& in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple& acc_elementwise_op_tuple,
|
||||
index_t block_group_size,
|
||||
index_t num_k_block_tile_iteration,
|
||||
Array<AccDataType, NumReduction> alpha_values,
|
||||
const InDataType* const __restrict__ p_in_value_global,
|
||||
Array<AccDataType, NumReduction> beta_values,
|
||||
OutDataTypePointerTuple p_out_value_global_tuple)
|
||||
{
|
||||
const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
|
||||
|
||||
// LDS, reused by all reductions
|
||||
__shared__ AccDataType p_reduce_work_buffer[BlockSize];
|
||||
|
||||
const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_in_value_global,
|
||||
in_grid_desc_m_k.GetElementSpaceSize(),
|
||||
ReduceOperation::template GetIdentityValue<InDataType>());
|
||||
auto out_global_val_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
return make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize());
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
auto reduce_work_buf =
|
||||
make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
in_thread_buf;
|
||||
|
||||
auto in_thread_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
(void)iR;
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
AccDataType,
|
||||
MThreadSliceSize * KThreadSliceSize,
|
||||
true>{};
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
auto accu_value_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
(void)iR;
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>{};
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
static_for<0, MThreadSliceSize, 1>{}(
|
||||
[&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; });
|
||||
});
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
const index_t blkgroup_id = block_global_id / block_group_size;
|
||||
const index_t block_local_id = block_global_id % block_group_size;
|
||||
|
||||
const auto thread_cluster_idx =
|
||||
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
|
||||
|
||||
const auto thread_m_cluster_id = thread_cluster_idx[I0];
|
||||
const auto thread_k_cluster_id = thread_cluster_idx[I1];
|
||||
|
||||
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
|
||||
|
||||
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
|
||||
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
|
||||
|
||||
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
decltype(thread_buffer_desc),
|
||||
ThreadBufferLengths,
|
||||
ThreadBufferDimAccessOrder,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
1,
|
||||
false>(
|
||||
in_grid_desc_m_k,
|
||||
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
|
||||
block_local_id * reduceSizePerBlock +
|
||||
thread_k_cluster_id * KThreadSliceSize));
|
||||
|
||||
constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);
|
||||
|
||||
index_t reducedTiles = 0;
|
||||
do
|
||||
{
|
||||
threadwise_src_load.Run(in_grid_desc_m_k,
|
||||
in_global_val_buf,
|
||||
thread_buffer_desc,
|
||||
make_tuple(I0, I0),
|
||||
in_thread_buf);
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
// do element-wise pre-reduction operation
|
||||
static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
|
||||
constexpr auto offset =
|
||||
thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
|
||||
in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number<offset>{}),
|
||||
in_thread_buf(Number<offset>{}));
|
||||
});
|
||||
});
|
||||
|
||||
ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR));
|
||||
});
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
|
||||
|
||||
reducedTiles++;
|
||||
} while(reducedTiles < num_k_block_tile_iteration);
|
||||
|
||||
constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
using OutDataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
|
||||
using OutDataType = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf_tuple(iR)(I));
|
||||
});
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I),
|
||||
accu_value_buf_tuple(iR)(I));
|
||||
|
||||
accu_value_buf_tuple(iR)(I) *= alpha_values[iR];
|
||||
}
|
||||
});
|
||||
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
if(block_group_size == 0 && !float_equal_zero{}(beta_values[iR]))
|
||||
{
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
|
||||
priorDstValueBuf;
|
||||
|
||||
auto threadwise_dst_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<OutDataType,
|
||||
OutDataType,
|
||||
decltype(out_grid_desc_m_tuple[iR]),
|
||||
decltype(reduced_data_desc),
|
||||
Sequence<MThreadSliceSize>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
OutDstVectorSizeSeq::At(iR),
|
||||
1,
|
||||
false>(
|
||||
out_grid_desc_m_tuple[iR],
|
||||
make_multi_index(blkgroup_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize));
|
||||
|
||||
threadwise_dst_load.Run(out_grid_desc_m_tuple[iR],
|
||||
out_global_val_buf_tuple(iR),
|
||||
reduced_data_desc,
|
||||
make_tuple(I0),
|
||||
priorDstValueBuf);
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
accu_value_buf_tuple(iR)(I) +=
|
||||
type_convert<AccDataType>(priorDstValueBuf[I]) * beta_values[iR];
|
||||
});
|
||||
};
|
||||
|
||||
auto threadwise_dst_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
OutDataType,
|
||||
decltype(reduced_data_desc),
|
||||
decltype(out_grid_desc_m_tuple[iR]),
|
||||
PassThroughOp,
|
||||
Sequence<MThreadSliceSize>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
OutDstVectorSizeSeq::At(iR),
|
||||
OutMemoryDataOperation,
|
||||
1,
|
||||
true>(
|
||||
out_grid_desc_m_tuple[iR],
|
||||
make_multi_index(blkgroup_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
threadwise_dst_store.Run(reduced_data_desc,
|
||||
make_tuple(I0),
|
||||
accu_value_buf_tuple[iR],
|
||||
out_grid_desc_m_tuple[iR],
|
||||
out_global_val_buf_tuple(iR));
|
||||
};
|
||||
});
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,264 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/reduction_common.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/utility/reduction_functions_accumulate.hpp"
|
||||
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename GridwiseMultipleReduction,
|
||||
index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename AccDataType,
|
||||
typename InGridDesc_M_K,
|
||||
typename OutGridDesc_M_Tuple,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple>
|
||||
__global__ void
|
||||
kernel_multiple_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
|
||||
const OutGridDesc_M_Tuple out_grid_desc_m_tuple,
|
||||
const InElementwiseOperationTuple in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple acc_elementwise_op_tuple,
|
||||
Array<AccDataType, NumReduction> alpha_values,
|
||||
const InDataType* const __restrict__ p_in_value_global,
|
||||
Array<AccDataType, NumReduction> beta_values,
|
||||
OutDataTypePointerTuple p_out_value_global_tuple)
|
||||
{
|
||||
GridwiseMultipleReduction::Run(in_grid_desc_m_k,
|
||||
out_grid_desc_m_tuple,
|
||||
in_elementwise_op_tuple,
|
||||
acc_elementwise_op_tuple,
|
||||
alpha_values,
|
||||
p_in_value_global,
|
||||
beta_values,
|
||||
p_out_value_global_tuple);
|
||||
};
|
||||
|
||||
template <index_t NumReduction,
|
||||
typename InDataType,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename AccDataType,
|
||||
typename InGridDesc_M_K,
|
||||
typename OutGridDesc_M_Tuple,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperationTuple,
|
||||
typename AccElementwiseOperationTuple,
|
||||
InMemoryDataOperationEnum OutMemoryDataOperation,
|
||||
bool PropagateNan,
|
||||
index_t BlockSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
typename OutDstVectorSizeSeq>
|
||||
struct GridwiseMultipleReduction_mk_to_m_threadwise
|
||||
{
|
||||
static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
|
||||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(NumReduction == OutDataTypePointerTuple::Size() &&
|
||||
NumReduction == OutGridDesc_M_Tuple::Size() &&
|
||||
NumReduction == OutDstVectorSizeSeq::Size() &&
|
||||
NumReduction == InElementwiseOperationTuple::Size() &&
|
||||
NumReduction == AccElementwiseOperationTuple::Size(),
|
||||
"All tuple should have the same size as the number of Reductions!");
|
||||
|
||||
static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
|
||||
|
||||
using ThreadBufferDimAccessOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
|
||||
|
||||
using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
|
||||
ThreadReduceSrcDesc_M_K,
|
||||
ThreadReduceDstDesc_M,
|
||||
ReduceOperation,
|
||||
PropagateNan>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
|
||||
using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
|
||||
|
||||
__device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
|
||||
const OutGridDesc_M_Tuple& out_grid_desc_m_tuple,
|
||||
const InElementwiseOperationTuple& in_elementwise_op_tuple,
|
||||
const AccElementwiseOperationTuple& acc_elementwise_op_tuple,
|
||||
Array<AccDataType, NumReduction> alpha_values,
|
||||
const InDataType* const __restrict__ p_in_value_global,
|
||||
Array<AccDataType, NumReduction> beta_values,
|
||||
OutDataTypePointerTuple p_out_value_global_tuple)
|
||||
{
|
||||
const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
|
||||
|
||||
const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_in_value_global,
|
||||
in_grid_desc_m_k.GetElementSpaceSize(),
|
||||
ReduceOperation::template GetIdentityValue<InDataType>());
|
||||
auto out_global_val_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
return make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize());
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
in_thread_buf;
|
||||
|
||||
auto in_thread_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
(void)iR;
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
AccDataType,
|
||||
MThreadSliceSize * KThreadSliceSize,
|
||||
true>{};
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
auto accu_value_buf_tuple = generate_tuple(
|
||||
[&](auto iR) {
|
||||
(void)iR;
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>{};
|
||||
},
|
||||
Number<NumReduction>{});
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
static_for<0, MThreadSliceSize, 1>{}(
|
||||
[&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; });
|
||||
});
|
||||
|
||||
const index_t thread_global_1d_id = get_thread_global_1d_id();
|
||||
|
||||
const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
|
||||
constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
|
||||
|
||||
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
decltype(thread_buffer_desc),
|
||||
ThreadBufferLengths,
|
||||
ThreadBufferDimAccessOrder,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
1,
|
||||
false>(
|
||||
in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));
|
||||
|
||||
constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
|
||||
|
||||
index_t reducedLength = 0;
|
||||
do
|
||||
{
|
||||
threadwise_src_load.Run(in_grid_desc_m_k,
|
||||
in_global_val_buf,
|
||||
thread_buffer_desc,
|
||||
make_tuple(I0, I0),
|
||||
in_thread_buf);
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
// do element-wise pre-reduction operation
|
||||
static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
|
||||
constexpr auto offset =
|
||||
thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
|
||||
in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number<offset>{}),
|
||||
in_thread_buf(Number<offset>{}));
|
||||
});
|
||||
});
|
||||
|
||||
ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR));
|
||||
});
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
|
||||
|
||||
reducedLength += KThreadSliceSize;
|
||||
} while(reducedLength < toReduceLength);
|
||||
|
||||
constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};
|
||||
|
||||
static_for<0, NumReduction, 1>{}([&](auto iR) {
|
||||
using OutDataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
|
||||
using OutDataType = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I),
|
||||
accu_value_buf_tuple(iR)(I));
|
||||
|
||||
accu_value_buf_tuple(iR)(I) *= alpha_values[iR];
|
||||
});
|
||||
|
||||
if(!float_equal_zero{}(beta_values[iR]))
|
||||
{
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
|
||||
priorDstValueBuf;
|
||||
|
||||
auto threadwise_dst_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<OutDataType,
|
||||
OutDataType,
|
||||
decltype(out_grid_desc_m_tuple[iR]),
|
||||
decltype(reduced_data_desc),
|
||||
Sequence<MThreadSliceSize>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
OutDstVectorSizeSeq::At(iR),
|
||||
1,
|
||||
false>(
|
||||
out_grid_desc_m_tuple[iR],
|
||||
make_multi_index(thread_global_1d_id * MThreadSliceSize));
|
||||
|
||||
threadwise_dst_load.Run(out_grid_desc_m_tuple[iR],
|
||||
out_global_val_buf_tuple(iR),
|
||||
reduced_data_desc,
|
||||
make_tuple(I0),
|
||||
priorDstValueBuf);
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
accu_value_buf_tuple(iR)(I) +=
|
||||
type_convert<AccDataType>(priorDstValueBuf[I]) * beta_values[iR];
|
||||
});
|
||||
};
|
||||
|
||||
auto threadwise_dst_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
OutDataType,
|
||||
decltype(reduced_data_desc),
|
||||
decltype(out_grid_desc_m_tuple[iR]),
|
||||
PassThroughOp,
|
||||
Sequence<MThreadSliceSize>,
|
||||
Sequence<0>,
|
||||
0,
|
||||
OutDstVectorSizeSeq::At(iR),
|
||||
OutMemoryDataOperation,
|
||||
1,
|
||||
true>(
|
||||
out_grid_desc_m_tuple[iR],
|
||||
make_multi_index(thread_global_1d_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
threadwise_dst_store.Run(reduced_data_desc,
|
||||
make_tuple(I0),
|
||||
accu_value_buf_tuple[iR],
|
||||
out_grid_desc_m_tuple[iR],
|
||||
out_global_val_buf_tuple(iR));
|
||||
});
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
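Taken together, the multiblock and threadwise kernels above compute, for each reduction r, roughly out_r = alpha_r * acc_op_r(reduce_k(in_op_r(x[m, k]))) + beta_r * prior_out_r, where the prior output is only read back when beta_r is non-zero (the float_equal_zero checks). A small numeric sketch, assuming Add as the reduce operation and pass-through element-wise ops:

    // reducing the row {1, 2, 3} with alpha = 2, beta = 0.5 and a prior output of 10
    // gives 2 * (1 + 2 + 3) + 0.5 * 10 = 17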
@@ -1,254 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/tensor_description/cluster_descriptor.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename Gridwise5AryEltwise,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename DDataType,
|
||||
typename EDataType,
|
||||
typename FDataType,
|
||||
typename AGridDesc_M,
|
||||
typename BGridDesc_M,
|
||||
typename CGridDesc_M,
|
||||
typename DGridDesc_M,
|
||||
typename EGridDesc_M,
|
||||
typename FGridDesc_M,
|
||||
typename ElementwiseFunctor>
|
||||
__global__ void kernel_5ary_elementwise_1d(const ADataType* __restrict__ p_a_global,
|
||||
const BDataType* __restrict__ p_b_global,
|
||||
const CDataType* __restrict__ p_c_global,
|
||||
const DDataType* __restrict__ p_d_global,
|
||||
const EDataType* __restrict__ p_e_global,
|
||||
FDataType* __restrict__ p_f_global,
|
||||
const AGridDesc_M a_grid_desc_m,
|
||||
const BGridDesc_M b_grid_desc_m,
|
||||
const CGridDesc_M c_grid_desc_m,
|
||||
const DGridDesc_M d_grid_desc_m,
|
||||
const EGridDesc_M e_grid_desc_m,
|
||||
const FGridDesc_M f_grid_desc_m,
|
||||
const ElementwiseFunctor functor)
|
||||
{
|
||||
Gridwise5AryEltwise::Run(p_a_global,
|
||||
p_b_global,
|
||||
p_c_global,
|
||||
p_d_global,
|
||||
p_e_global,
|
||||
p_f_global,
|
||||
a_grid_desc_m,
|
||||
b_grid_desc_m,
|
||||
c_grid_desc_m,
|
||||
d_grid_desc_m,
|
||||
e_grid_desc_m,
|
||||
f_grid_desc_m,
|
||||
functor);
|
||||
}
|
||||
|
||||
// TODO - implement n-ary Elemenetwise_1D, tuple of inputs and tuple of outputs
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename DDataType,
|
||||
typename EDataType,
|
||||
typename FDataType,
|
||||
typename ComputeDataType,
|
||||
typename AGridDesc_M,
|
||||
typename BGridDesc_M,
|
||||
typename CGridDesc_M,
|
||||
typename DGridDesc_M,
|
||||
typename EGridDesc_M,
|
||||
typename FGridDesc_M,
|
||||
typename ElementwiseFunctor,
|
||||
index_t MPerThread,
|
||||
index_t AScalarPerVector,
|
||||
index_t BScalarPerVector,
|
||||
index_t CScalarPerVector,
|
||||
index_t DScalarPerVector,
|
||||
index_t EScalarPerVector,
|
||||
index_t FScalarPerVector>
|
||||
struct Gridwise5AryElementwise_1D
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto thread_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
|
||||
|
||||
using PassThrough = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static __device__ auto CalculateElementwiseIndex()
|
||||
{
|
||||
const index_t global_thread_id = get_thread_global_1d_id();
|
||||
return make_multi_index(global_thread_id * MPerThread);
|
||||
}
|
||||
|
||||
__device__ static void Run(const ADataType* __restrict__ p_a_global,
|
||||
const BDataType* __restrict__ p_b_global,
|
||||
const CDataType* __restrict__ p_c_global,
|
||||
const DDataType* __restrict__ p_d_global,
|
||||
const EDataType* __restrict__ p_e_global,
|
||||
FDataType* __restrict__ p_f_global,
|
||||
const AGridDesc_M a_grid_desc_m,
|
||||
const BGridDesc_M b_grid_desc_m,
|
||||
const CGridDesc_M c_grid_desc_m,
|
||||
const DGridDesc_M d_grid_desc_m,
|
||||
const EGridDesc_M e_grid_desc_m,
|
||||
const FGridDesc_M f_grid_desc_m,
|
||||
const ElementwiseFunctor functor)
|
||||
{
|
||||
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_a_global, a_grid_desc_m.GetElementSpaceSize());
|
||||
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_b_global, b_grid_desc_m.GetElementSpaceSize());
|
||||
const auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_c_global, c_grid_desc_m.GetElementSpaceSize());
|
||||
const auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_d_global, d_grid_desc_m.GetElementSpaceSize());
|
||||
const auto e_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_e_global, e_grid_desc_m.GetElementSpaceSize());
|
||||
auto f_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_f_global, f_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> a_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> b_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> c_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> d_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> e_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> f_thread_buf;
|
||||
|
||||
const auto thread_store_global_offset = CalculateElementwiseIndex();
|
||||
|
||||
auto a_global_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<ADataType,
|
||||
ComputeDataType,
|
||||
AGridDesc_M,
|
||||
decltype(thread_desc_m),
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
AScalarPerVector, // ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
false>{a_grid_desc_m, thread_store_global_offset};
|
||||
|
||||
auto b_global_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<BDataType,
|
||||
ComputeDataType,
|
||||
BGridDesc_M,
|
||||
decltype(thread_desc_m),
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
BScalarPerVector, // ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
false>{b_grid_desc_m, thread_store_global_offset};
|
||||
|
||||
auto c_global_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<CDataType,
|
||||
ComputeDataType,
|
||||
CGridDesc_M,
|
||||
decltype(thread_desc_m),
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
CScalarPerVector, // ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
false>{c_grid_desc_m, thread_store_global_offset};
|
||||
|
||||
auto d_global_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<DDataType,
|
||||
ComputeDataType,
|
||||
DGridDesc_M,
|
||||
decltype(thread_desc_m),
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
DScalarPerVector, // ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
false>{d_grid_desc_m, thread_store_global_offset};
|
||||
|
||||
auto e_global_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<EDataType,
|
||||
ComputeDataType,
|
||||
EGridDesc_M,
|
||||
decltype(thread_desc_m),
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
EScalarPerVector, // ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
false>{e_grid_desc_m, thread_store_global_offset};
|
||||
|
||||
auto f_global_write =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
FDataType,
|
||||
decltype(thread_desc_m),
|
||||
FGridDesc_M,
|
||||
PassThrough,
|
||||
Sequence<MPerThread>, // SliceLengths
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // DstVectorDim
|
||||
FScalarPerVector, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1, // DstScalarStrideInVector
|
||||
false>{
|
||||
f_grid_desc_m, thread_store_global_offset, PassThrough{}};
|
||||
|
||||
const index_t blockSize = get_block_size();
|
||||
const index_t blockPerGrid = get_grid_size();
|
||||
const auto M = c_grid_desc_m.GetLength(I0);
|
||||
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
|
||||
const auto loop_step_index = make_multi_index(loop_step);
|
||||
|
||||
index_t num_iter = M / (loop_step);
|
||||
do
|
||||
{
|
||||
// read and process MPerThread elements
|
||||
a_global_load.Run(
|
||||
a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf);
|
||||
|
||||
b_global_load.Run(
|
||||
b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf);
|
||||
|
||||
c_global_load.Run(
|
||||
c_grid_desc_m, c_global_buf, thread_desc_m, make_tuple(I0), c_thread_buf);
|
||||
|
||||
d_global_load.Run(
|
||||
d_grid_desc_m, d_global_buf, thread_desc_m, make_tuple(I0), d_thread_buf);
|
||||
|
||||
e_global_load.Run(
|
||||
e_grid_desc_m, e_global_buf, thread_desc_m, make_tuple(I0), e_thread_buf);
|
||||
|
||||
static_for<0, MPerThread, 1>{}([&](auto m) {
|
||||
constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m));
|
||||
functor(f_thread_buf(Number<offset>{}),
|
||||
a_thread_buf(Number<offset>{}),
|
||||
b_thread_buf(Number<offset>{}),
|
||||
c_thread_buf(Number<offset>{}),
|
||||
d_thread_buf(Number<offset>{}),
|
||||
e_thread_buf(Number<offset>{}));
|
||||
});
|
||||
|
||||
f_global_write.Run(thread_desc_m,
|
||||
make_tuple(I0), // SrcSliceOriginIdx
|
||||
f_thread_buf,
|
||||
f_grid_desc_m,
|
||||
f_global_buf);
|
||||
|
||||
a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index);
|
||||
b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index);
|
||||
c_global_load.MoveSrcSliceWindow(c_grid_desc_m, loop_step_index);
|
||||
d_global_load.MoveSrcSliceWindow(d_grid_desc_m, loop_step_index);
|
||||
e_global_load.MoveSrcSliceWindow(e_grid_desc_m, loop_step_index);
|
||||
f_global_write.MoveDstSliceWindow(f_grid_desc_m, loop_step_index);
|
||||
} while(--num_iter);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,155 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

namespace ck {

template <typename GridwiseBinEltwise,
typename ADataType,
typename BDataType,
typename CDataType,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename ElementwiseFunctor>
__global__ void kernel_binary_elementwise_1d(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const ElementwiseFunctor functor)
{
GridwiseBinEltwise::Run(
p_a_global, p_b_global, p_c_global, a_grid_desc_m, b_grid_desc_m, c_grid_desc_m, functor);
}

template <typename ADataType,
typename BDataType,
typename CDataType,
typename ComputeDataType,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename ElementwiseFunctor,
index_t MPerThread,
index_t AScalarPerVector,
index_t BScalarPerVector,
index_t CScalarPerVector>
struct GridwiseBinaryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

using PassThrough = tensor_operation::element_wise::PassThrough;

static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * MPerThread);
}

__device__ static void Run(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_global, c_grid_desc_m.GetElementSpaceSize());

StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> c_thread_buf;

const auto thread_store_global_offset = CalculateElementwiseIndex();

auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ComputeDataType,
AGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
AScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{a_grid_desc_m, thread_store_global_offset};

auto b_global_load =
ThreadwiseTensorSliceTransfer_v2<BDataType,
ComputeDataType,
BGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
BScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{b_grid_desc_m, thread_store_global_offset};

auto c_global_write =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
CDataType,
decltype(thread_desc_m),
CGridDesc_M,
PassThrough,
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
CScalarPerVector, // ScalarPerVector
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
c_grid_desc_m, thread_store_global_offset, PassThrough{}};

const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto M = c_grid_desc_m.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);

index_t num_iter = M / (loop_step);
do
{
// read and process MPerThread elements
a_global_load.Run(
a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf);

b_global_load.Run(
b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf);

static_for<0, MPerThread, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m));
functor(c_thread_buf(Number<offset>{}),
a_thread_buf(Number<offset>{}),
b_thread_buf(Number<offset>{}));
});

c_global_write.Run(thread_desc_m,
make_tuple(I0), // SrcSliceOriginIdx
c_thread_buf,
c_grid_desc_m,
c_global_buf);

a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index);
b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index);
c_global_write.MoveDstSliceWindow(c_grid_desc_m, loop_step_index);
} while(--num_iter);
}
};

} // namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp (new file, 191 lines)
@@ -0,0 +1,191 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseElementwise1dFunctor,
typename InGrid1dDescTuple,
typename OutGrid1dDescTuple,
typename InDataTypePointerTuple,
typename OutDataTypePointerTuple,
typename ElementwiseOperation>
__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple,
const OutGrid1dDescTuple out_grid_1d_desc_tuple,
const InDataTypePointerTuple p_in_global_tuple,
const OutDataTypePointerTuple p_out_global_tuple,
const ElementwiseOperation elementwise_op)
{
GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple,
out_grid_1d_desc_tuple,
p_in_global_tuple,
p_out_global_tuple,
elementwise_op);
}

template <typename InGrid1dDescTuple,
typename OutGrid1dDescTuple,
typename InDataTypePointerTuple,
typename OutDataTypePointerTuple,
typename ElementwiseOperation,
index_t MPerThread,
typename InScalarPerVectorSeq,
typename OutScalarPerVectorSeq>
struct GridwiseElementwise_1D
{
static constexpr index_t NumInput = InDataTypePointerTuple::Size();
static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();

static_assert(NumInput == InScalarPerVectorSeq::Size() &&
NumOutput == OutScalarPerVectorSeq::Size() &&
NumInput == InGrid1dDescTuple::Size() &&
NumOutput == OutGrid1dDescTuple::Size(),
"Tuple size is inconsistent with the number of in/out!");

static constexpr auto I0 = Number<0>{};

static constexpr auto thread_buffer_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

using PassThroughOp = tensor_operation::element_wise::PassThrough;

__device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple,
const OutGrid1dDescTuple out_grid_1d_desc_tuple,
const InDataTypePointerTuple p_in_global_tuple,
const OutDataTypePointerTuple p_out_global_tuple,
const ElementwiseOperation elementwise_op)
{
const index_t thread_global_id = get_thread_global_1d_id();

auto in_thread_buf_tuple = generate_tuple(
[&](auto I) {
using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
using DataType = remove_cv_t<remove_pointer_t<DataTypePointer>>;

return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
},
Number<NumInput>{});

auto out_thread_buf_tuple = generate_tuple(
[&](auto I) {
using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
using DataType = remove_pointer_t<DataTypePointer>;

return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
},
Number<NumOutput>{});

auto in_global_buf_tuple = generate_tuple(
[&](auto I) {
return make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize());
},
Number<NumInput>{});

auto out_global_buf_tuple = generate_tuple(
[&](auto I) {
return make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize());
},
Number<NumOutput>{});

const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);

const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto M = in_grid_1d_desc_tuple[I0].GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);

auto in_global_load_tuple = generate_tuple(
[&](auto I) {
using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
using DataType = remove_cv_t<remove_pointer_t<DataTypePointer>>;

return ThreadwiseTensorSliceTransfer_v2<DataType,
DataType,
decltype(in_grid_1d_desc_tuple[I]),
decltype(thread_buffer_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
InScalarPerVectorSeq::At(
I), // ScalarPerVector
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc_tuple[I],
thread_global_offset};
},
Number<NumInput>{});

auto out_global_store_tuple = generate_tuple(
[&](auto I) {
using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
using DataType = remove_pointer_t<DataTypePointer>;

return ThreadwiseTensorSliceTransfer_v1r3<DataType,
DataType,
decltype(thread_buffer_desc_m),
decltype(out_grid_1d_desc_tuple[I]),
PassThroughOp,
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum::Set,
1,
false>(
out_grid_1d_desc_tuple[I], thread_global_offset, PassThroughOp{});
},
Number<NumOutput>{});

index_t num_iter = M / (loop_step);
do
{
static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I],
in_global_buf_tuple[I],
thread_buffer_desc_m,
make_tuple(I0),
in_thread_buf_tuple(I));

in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I],
loop_step_index);
});

static_for<0, MPerThread, 1>{}([&](auto iM) {
// get reference to in data
const auto in_data_refs = generate_tie(
// return type should be lvalue
[&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM); },
Number<NumInput>{});

// get reference to dst data
auto out_data_refs = generate_tie(
// return type should be lvalue
[&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); },
Number<NumOutput>{});

unpack2(elementwise_op, out_data_refs, in_data_refs);
});

static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).Run(thread_buffer_desc_m,
make_tuple(I0),
out_thread_buf_tuple[I],
out_grid_1d_desc_tuple[I],
out_global_buf_tuple(I));

out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I],
loop_step_index);
});
} while(--num_iter);
}
};

} // namespace ck
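One point worth noting about the kernel above: its main loop runs num_iter = M / (gridSize * blockSize * MPerThread) passes with a do/while, so the caller is expected to choose sizes that make M a multiple of that step. A minimal host-side sketch of that launch-size math, with blockSize and MPerThread as assumed example values rather than anything fixed by this commit:

#include <cassert>

// Returns the grid size implied by the loop structure of GridwiseElementwise_1D::Run.
int calculate_grid_size(int M, int blockSize /* e.g. 256 */, int MPerThread /* e.g. 8 */)
{
    // Otherwise the do/while would either skip the tail or underflow num_iter.
    assert(M % (blockSize * MPerThread) == 0);
    return M / (blockSize * MPerThread); // one loop iteration; a smaller grid means more iterations
}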
@@ -0,0 +1,86 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

namespace ck {

template <typename Grid1dBufferDescTuple,
index_t NumBuffer,
index_t BlockSize,
typename DataTypePointerTuple,
typename DataTypeTuple>
__global__ void
kernel_multiple_buffer_set_value(const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple,
DataTypePointerTuple p_global_tuple,
DataTypeTuple value_tuple)

{
static_assert(NumBuffer == DataTypePointerTuple::Size() && NumBuffer == DataTypeTuple::Size(),
"The tuple size should be same as NumBuffer!");

static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataTypePointer = remove_cvref_t<decltype(DataTypePointerTuple{}[iB])>;
using DataTypeFromPointer = remove_pointer_t<DataTypePointer>;
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;

static_assert(is_same<DataType, DataTypeFromPointer>::value,
"Types in tuples does not match!");
});

constexpr auto I0 = Number<0>{};

const index_t thread_global_id = get_thread_global_1d_id();

auto value_buf_tuple = generate_tuple(
[&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;

return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, 1, true>{};
},
Number<NumBuffer>{});

static_for<0, NumBuffer, 1>{}([&](auto iB) {
static_for<0, 1, 1>{}([&](auto J) { value_buf_tuple(iB)(J) = value_tuple[iB]; });
});

auto global_buf_tuple = generate_tuple(
[&](auto iB) {
return make_dynamic_buffer<AddressSpaceEnum::Global>(
p_global_tuple(iB), grid_1d_buffer_desc_tuple[iB].GetElementSpaceSize());
},
Number<NumBuffer>{});

constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;

auto threadwise_store =
ThreadwiseTensorSliceTransfer_v1r3<DataType,
DataType,
decltype(val_buff_desc),
decltype(Grid1dBufferDescTuple{}[iB]),
PassThroughOp,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
grid_1d_buffer_desc_tuple[iB], make_multi_index(thread_global_id), PassThroughOp{});

threadwise_store.Run(val_buff_desc,
make_tuple(I0),
value_buf_tuple(iB),
grid_1d_buffer_desc_tuple[iB],
global_buf_tuple(iB));
});
};

} // namespace ck
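Each thread of the kernel above writes exactly one element of every buffer in the tuple, so across the grid each buffer ends up filled with its matching value. As a point of reference only, the host-side behaviour it mirrors is a plain fill; the function name below is illustrative and not part of the commit:

#include <cstddef>

// Host-side equivalent of kernel_multiple_buffer_set_value for a single buffer.
template <typename T>
void set_buffer_value_host(T* p, std::size_t length, T value)
{
    for(std::size_t i = 0; i < length; ++i)
        p[i] = value;
}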
@@ -1,132 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseUEltwise,
typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor>
__global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
}

template <typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor,
index_t ScalarPerVector>
struct GridwiseUnaryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m0 =
make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));

using PassThrough = tensor_operation::element_wise::PassThrough;

static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * ScalarPerVector);
}

__host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0)
{
return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0);
}

__host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size)
{
const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector);

return grid_size;
}

__device__ static void Run(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m0.GetElementSpaceSize());
auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());

StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;

const auto thread_store_global_offset = CalculateElementwiseIndex();

auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ADataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
ScalarPerVector,
1, // SrcScalarStrideInVector
false>{a_grid_desc_m0, thread_store_global_offset};

auto b_global_write =
ThreadwiseTensorSliceTransfer_v1r3<BDataType,
BDataType,
decltype(thread_desc_m0),
GridDesc_M0,
PassThrough,
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
ScalarPerVector,
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
b_grid_desc_m0, thread_store_global_offset, PassThrough{}};

const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = b_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto loop_step_index = make_multi_index(loop_step);

index_t num_iter = m0 / (loop_step);
do
{
// read and process ScalarPerVector elements
a_global_load.Run(
a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf);

static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
});

b_global_write.Run(thread_desc_m0,
make_tuple(I0), // SrcSliceOriginIdx
b_thread_buf,
b_grid_desc_m0,
b_global_buf);

a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index);
b_global_write.MoveDstSliceWindow(b_grid_desc_m0, loop_step_index);
} while(--num_iter);
}
};

} // namespace ck
@@ -0,0 +1,259 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <thread>

#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormFwd_Input_N_H_W_C_Output_C : public device::DeviceBatchNormFwd<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
InOutDataType* p_y,
double exponentialAverageFactor,
AccDataType* resultRunningMean,
AccDataType* resultRunningVariance,
double epsilon,
AccDataType* resultSaveMean,
AccDataType* resultSaveInvVariance)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
p_y_(p_y),
resultRunningMean_(resultRunningMean),
resultRunningVariance_(resultRunningVariance),
resultSaveMean_(resultSaveMean),
resultSaveInvVariance_(resultSaveInvVariance),
exponentialAverageFactor_(exponentialAverageFactor),
epsilon_(epsilon)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;

if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");

n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];

resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr);
resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr);
}

const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;
InOutDataType* p_y_;

AccDataType* resultRunningMean_;
AccDataType* resultRunningVariance_;
AccDataType* resultSaveMean_;
AccDataType* resultSaveInvVariance_;

bool resultSave, resultRunning;

index_t n, h, w, c;

double exponentialAverageFactor_;
double epsilon_;
};

struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
AccDataType reduceSize = type_convert<AccDataType>(arg.n) *
type_convert<AccDataType>(arg.h) *
type_convert<AccDataType>(arg.w);
index_t offset_C = iC;
AccDataType mean = type_convert<AccDataType>(0.0f);
AccDataType meansquare = type_convert<AccDataType>(0.0f);

// compute mean, meansquare, variance, invVariance
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;

auto offset = offset_N + offset_H + offset_W + offset_C;

AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

mean += x;
meansquare += x * x;
};
}
};

mean = mean / reduceSize;
meansquare = meansquare / reduceSize;

AccDataType variance = meansquare - mean * mean;
AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);

// save the mean/invVariance if required
if(arg.resultSave)
{
arg.resultSaveMean_[iC] = mean;
arg.resultSaveInvVariance_[iC] = invVariance;
};

// update the moving average if required
if(arg.resultRunning)
{
arg.resultRunningMean_[iC] =
arg.resultRunningMean_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
mean * arg.exponentialAverageFactor_;
arg.resultRunningVariance_[iC] =
arg.resultRunningVariance_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
variance * arg.exponentialAverageFactor_;
};

// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;

auto offset = offset_N + offset_H + offset_W + offset_C;

AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];

arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};

std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;

std::vector<joinable_thread> threads(num_thread);

for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);

auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};

threads[it] = joinable_thread(f);
}

return (0.0f);
};

float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};

bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;

return (true);
};

std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
void* p_y,
double exponentialAverageFactor,
void* resultRunningMean,
void* resultRunningVariance,
double epsilon,
void* resultSaveMean,
void* resultSaveInvVariance) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
static_cast<InOutDataType*>(p_y),
exponentialAverageFactor,
static_cast<AccDataType*>(resultRunningMean),
static_cast<AccDataType*>(resultRunningVariance),
epsilon,
static_cast<AccDataType*>(resultSaveMean),
static_cast<AccDataType*>(resultSaveInvVariance));
};

std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};

std::string GetTypeString() const override
{
auto str = std::stringstream();

// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on

return str.str();
}
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
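For reference, the per-channel math the reference implementation above performs (reducing over N*H*W in NHWC layout) can be written as a standalone helper. This is a minimal sketch, not part of the commit; the running-statistics update uses the same exponentialAverageFactor blending shown above.

#include <cmath>

struct ChannelStats
{
    float mean, variance, invVariance;
};

// sum_x and sum_xx are the raw sums over the N*H*W elements of one channel.
ChannelStats batchnorm_channel_stats(float sum_x, float sum_xx, float reduce_size, float epsilon)
{
    ChannelStats s;
    s.mean        = sum_x / reduce_size;
    s.variance    = sum_xx / reduce_size - s.mean * s.mean; // E[x^2] - E[x]^2
    s.invVariance = 1.0f / std::sqrt(s.variance + epsilon);
    return s;
    // normalization then applies y = gamma * (x - mean) * invVariance + beta, and
    // running_mean/variance are blended as old * (1 - factor) + new * factor.
}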
@@ -0,0 +1,191 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>
#include <array>
#include <algorithm>

#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormInfer_Input_N_H_W_C_Output_C : public device::DeviceBatchNormInfer<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
double epsilon,
const AccDataType* estimatedMean,
const AccDataType* estimatedVariance,
InOutDataType* p_y)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
epsilon_(epsilon),
estimatedMean_(estimatedMean),
estimatedVariance_(estimatedVariance),
p_y_(p_y)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;

if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");

n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];
}

const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;

double epsilon_;

const AccDataType* estimatedMean_;
const AccDataType* estimatedVariance_;

InOutDataType* p_y_;

index_t n, h, w, c;
};

struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
index_t offset_C = iC;
AccDataType mean = arg.estimatedMean_[offset_C];
AccDataType variance = arg.estimatedVariance_[offset_C];

AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);

// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;

auto offset = offset_N + offset_H + offset_W + offset_C;

AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];

arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};

std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;

std::vector<joinable_thread> threads(num_thread);

for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);

auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};

threads[it] = joinable_thread(f);
}

return (0.0f);
};

float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};

bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;

return (true);
};

std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
double epsilon,
const void* estimatedMean,
const void* estimatedVariance,
void* p_y) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
epsilon,
static_cast<const AccDataType*>(estimatedMean),
static_cast<const AccDataType*>(estimatedVariance),
static_cast<InOutDataType*>(p_y));
};

std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};

std::string GetTypeString() const override
{
auto str = std::stringstream();

// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on

return str.str();
}
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
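A hypothetical host-side driver for the inference reference above is sketched below; every name that does not appear in the file (tensor extents, pointers, the helper's own name) is a placeholder, and packed NHWC strides are assumed purely for illustration.

void run_reference_bn_infer(const ck::half_t* p_x,
                            const float* p_scale,
                            const float* p_bias,
                            const float* p_est_mean,
                            const float* p_est_variance,
                            ck::half_t* p_y,
                            ck::index_t N, ck::index_t H, ck::index_t W, ck::index_t C)
{
    using RefInfer =
        ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<ck::half_t, float>;

    RefInfer ref;
    auto arg = ref.MakeArgumentPointer({N, H, W, C},
                                       {H * W * C, W * C, C, 1}, // assumed packed NHWC x strides
                                       {H * W * C, W * C, C, 1}, // assumed packed NHWC y strides
                                       {C},
                                       {1},
                                       p_x, p_scale, p_bias,
                                       1e-5, // epsilon (example value)
                                       p_est_mean, p_est_variance,
                                       p_y);

    if(ref.IsSupportedArgument(arg.get()))
        ref.MakeInvokerPointer()->Run(arg.get());
}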
@@ -17,9 +17,12 @@ namespace tensor_operation {
namespace device {
namespace instance {

using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr =
ck::tensor_operation::device::DeviceElementwisePtr<5, 1, 2, Normalize>;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr<
Tuple<half_t, float, float, half_t, half_t>,
Tuple<half_t>,
Normalize,
2>;

void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceNormalizeFromMeanMeanSquarePtr>& instances);
@@ -5,6 +5,7 @@

#include <cmath>
#include <numeric>
#include <random>

#include "ck/ck.hpp"

@@ -126,6 +127,23 @@ struct GeneratorTensor_3<ck::bhalf_t>
}
};

template <typename T>
struct GeneratorTensor_4
{
std::default_random_engine generator;
std::normal_distribution<float> distribution;

GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){};

template <typename... Is>
T operator()(Is...)
{
float tmp = distribution(generator);

return ck::type_convert<T>(tmp);
}
};

struct GeneratorTensor_Checkboard
{
template <typename... Ts>
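A small sketch of how the new normal-distribution generator might be used on the host; the helper name and buffer size are placeholders and the index arguments passed to the generator are ignored by design, as the operator() signature above shows.

#include <cstddef>
#include <vector>

std::vector<ck::half_t> make_normal_data(std::size_t n)
{
    GeneratorTensor_4<ck::half_t> gen{0.0f, 1.0f}; // mean 0, stddev 1
    std::vector<ck::half_t> data(n);
    for(auto& v : data)
        v = gen(0, 0, 0, 0); // indices are ignored; each call draws a fresh sample
    return data;
}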
@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
@@ -27,19 +27,17 @@ using outputType = F16;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
// clang-format off
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 8, 8, 1, 1, 8, 8, 8 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 4, 4, 1, 1, 4, 4, 4 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 2, 2, 1, 1, 2, 2, 2 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 1, 1, 1, 1, 1, 1, 1 >
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
// clang-format on
>;

void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceElementwisePtr<5, 1, 2, Normalize>>& instances)
std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
instances)
{
add_device_operation_instances(
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});