mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Transpose 3d (#984)
* added working example for 5D input using 1D kernel * example with 5D input tensor and 2d kernel - not working: issues with arguments * added updated version of 3d device op - changed descriptors/dims * added example file to check kernel * fixed descriptor and isSupportedArgument stride problem * added and modified kernel for 3d - updated tids/loop * adding some more 5d example files * fixed some issues * changes made for testing * working version: fixed error in stride for A, still a bit inefficient * cleaned up formatting/comments * updating formatting * more formatting fixes * fixing cmake, adding back gpu targets in cmake script * adding client example * added instances for client example * fixed errors in client example * implemented client ex with device_elementwise.hpp and device_elementwise_3d_impl.hpp * removed extra files * minor formatting and naming fixes * adding test files and profiler * fixing minor error * minor fix * removed unneccesary comments, renamed files * updated instance list for client example, added different layout example * removing instances * fixed error in instance generation * remove comments * update profiler and client example tensor layouts * fixed errors in test/profiler * updated vector dim access to enable vector load * updated test/profiler files * updated example with 1d kernel * updating profiler * renamed files --------- Co-authored-by: Jing Zhang <jizha@amd.com>
This commit is contained in:
2
client_example/23_elementwise_transpose/CMakeLists.txt
Normal file
2
client_example/23_elementwise_transpose/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
|
||||
target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_operations)
|
||||
@@ -0,0 +1,139 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
const int N = 16;
|
||||
const int C = 8;
|
||||
const int D = 8;
|
||||
const int H = 8;
|
||||
const int W = 8;
|
||||
|
||||
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
|
||||
std::vector<std::size_t> nchwd = {N, C, H, W, D};
|
||||
auto size = N * C * D * H * W;
|
||||
|
||||
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
|
||||
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
|
||||
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D
|
||||
|
||||
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
|
||||
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
|
||||
|
||||
std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_dev_buf.GetDeviceBuffer()};
|
||||
|
||||
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::
|
||||
DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, PassThrough, 5>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceElementwisePermuteInstance>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte =
|
||||
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
|
||||
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,2 +1,4 @@
|
||||
add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
|
||||
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
|
||||
add_example_executable(example_elementwise_permute elementwise_permute.cpp)
|
||||
add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
|
||||
|
||||
135
example/44_elementwise_permute/elementwise_permute.cpp
Normal file
135
example/44_elementwise_permute/elementwise_permute.cpp
Normal file
@@ -0,0 +1,135 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
|
||||
ck::Tuple<BDataType>, // OutDataTypeTuple
|
||||
PassThrough, // ElementwiseOp
|
||||
5, // NumDim
|
||||
8, // MPerThread
|
||||
ck::Sequence<1>, // InScalarPerVectorSeq
|
||||
ck::Sequence<1>>; // OutScalarPerVectorSeq
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
|
||||
{
|
||||
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
|
||||
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
|
||||
for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
|
||||
for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
|
||||
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
|
||||
{
|
||||
auto a_val = A_ncdhw(n, c, d, h, w);
|
||||
functor(B_ndhwc(n, d, h, w, c), a_val);
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
std::vector<std::size_t> ncdhw = {16, 8, 8, 8, 8};
|
||||
std::vector<std::size_t> ndhwc = {16, 8, 8, 8, 8};
|
||||
Tensor<ADataType> a(ncdhw);
|
||||
Tensor<BDataType> b(ndhwc);
|
||||
|
||||
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::array<ck::index_t, 5> ab_lengths;
|
||||
/**std::array<ck::index_t, 5> a_strides = {
|
||||
static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
|
||||
static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4]),
|
||||
static_cast<int>(ncdhw[3] * ncdhw[4]),
|
||||
static_cast<int>(ncdhw[4]),
|
||||
1};
|
||||
std::array<ck::index_t, 5> b_strides = {
|
||||
static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
|
||||
static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
|
||||
1,
|
||||
static_cast<int>(ndhwc[3] * ndhwc[4]),
|
||||
static_cast<int>(ndhwc[4])};**/
|
||||
|
||||
std::array<ck::index_t, 5> a_strides = {
|
||||
static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
|
||||
static_cast<int>(ncdhw[3] * ncdhw[4]),
|
||||
static_cast<int>(ncdhw[4]),
|
||||
1,
|
||||
static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4])};
|
||||
|
||||
std::array<ck::index_t, 5> b_strides = {
|
||||
static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
|
||||
static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
|
||||
static_cast<int>(ndhwc[3] * ndhwc[4]),
|
||||
static_cast<int>(ndhwc[4]),
|
||||
1};
|
||||
ck::ranges::copy(ncdhw, ab_lengths.begin());
|
||||
|
||||
auto broadcastPermute = DeviceElementwisePermuteInstance{};
|
||||
auto argument = broadcastPermute.MakeArgumentPointer(
|
||||
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
|
||||
|
||||
if(!broadcastPermute.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
std::cout << "A (ncdhw): " << a.mDesc << std::endl;
|
||||
std::cout << "B (ndhwc): " << b.mDesc << std::endl;
|
||||
|
||||
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
|
||||
float ave_time =
|
||||
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
|
||||
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
Tensor<BDataType> host_b(ndhwc);
|
||||
host_elementwise4D(host_b, a, PassThrough{});
|
||||
|
||||
pass &=
|
||||
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
}
|
||||
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
120
example/44_elementwise_permute/elementwise_permute_3d.cpp
Normal file
120
example/44_elementwise_permute/elementwise_permute_3d.cpp
Normal file
@@ -0,0 +1,120 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
|
||||
ck::Tuple<BDataType>, // OutDataTypeTuple
|
||||
PassThrough, // ElementwiseOp
|
||||
2, // NumDim_m, {N, C}
|
||||
2, // NumDim_n, {H, W}
|
||||
1, // NumDim_k, {D}
|
||||
8, // MPerThread
|
||||
8, // NPerThread
|
||||
8, // KPerThread
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<4>>; // OutScalarPerVectorSeq
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
|
||||
{
|
||||
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
|
||||
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
|
||||
for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
|
||||
for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
|
||||
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
|
||||
{
|
||||
auto a_val = A_ncdhw(n, c, d, h, w);
|
||||
functor(B_ndhwc(n, d, h, w, c), a_val);
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
const int N = 4;
|
||||
const int C = 16;
|
||||
const int H = 32;
|
||||
const int W = 5;
|
||||
const int D = 16;
|
||||
|
||||
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
|
||||
std::vector<std::size_t> ndhwc = {N, D, H, W, C};
|
||||
Tensor<ADataType> a(ncdhw);
|
||||
Tensor<BDataType> b(ndhwc);
|
||||
|
||||
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
|
||||
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
|
||||
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
|
||||
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
|
||||
|
||||
auto broadcastPermute = DeviceElementwisePermuteInstance{};
|
||||
auto argument = broadcastPermute.MakeArgumentPointer(
|
||||
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
|
||||
|
||||
if(!broadcastPermute.IsSupportedArgument(argument.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The runtime parameters seems not supported by the device instance, exiting!");
|
||||
};
|
||||
|
||||
std::cout << "A (ncdhw): " << a.mDesc << std::endl;
|
||||
std::cout << "B (ndhwc): " << b.mDesc << std::endl;
|
||||
|
||||
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
|
||||
float ave_time =
|
||||
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
|
||||
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
Tensor<BDataType> host_b(ndhwc);
|
||||
host_elementwise4D(host_b, a, PassThrough{});
|
||||
|
||||
pass &=
|
||||
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
}
|
||||
|
||||
return pass ? 0 : 1;
|
||||
}
|
||||
@@ -19,13 +19,13 @@ using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<1>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
|
||||
ck::Tuple<BDataType>, // OutDataTypeTuple
|
||||
PassThrough, // Elementwise op
|
||||
4, // NumDim
|
||||
8, // MPerThread
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<1>>; // OutScalarPerVectorSeq
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
|
||||
@@ -99,7 +99,6 @@ int main()
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
|
||||
@@ -17,15 +17,15 @@ using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
3, // NumDim_M
|
||||
1, // NumDim_N
|
||||
8,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
|
||||
ck::Tuple<BDataType>, // OutDataTypeTuple
|
||||
PassThrough, // Elementwise op
|
||||
3, // NumDim_M
|
||||
1, // NumDim_N
|
||||
1, // MPerThread
|
||||
1, // NPerThread
|
||||
ck::Sequence<1>, // InScalarPerVectorSeq
|
||||
ck::Sequence<1>>; // OutScalarPerVectorSeq
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc,
|
||||
@@ -53,12 +53,6 @@ int main()
|
||||
const int H = 32;
|
||||
const int W = 1024;
|
||||
|
||||
/**const int N = 120;
|
||||
const int H = 32;
|
||||
const int W = 64;
|
||||
|
||||
const int C = 128;**/
|
||||
|
||||
std::vector<std::size_t> nchw = {N, C, H, W};
|
||||
std::vector<std::size_t> nhwc = {N, H, W, C};
|
||||
|
||||
@@ -71,7 +65,6 @@ int main()
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
// LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
@@ -115,13 +108,10 @@ int main()
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
// LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
|
||||
|
||||
Tensor<BDataType> host_b(nhwc);
|
||||
host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
|
||||
host_b, a, nchw, PassThrough{});
|
||||
|
||||
// LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
|
||||
pass &=
|
||||
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,364 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/stream_utility.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t NumDim_m, // choose how to set dims
|
||||
index_t NumDim_n,
|
||||
index_t NumDim_k,
|
||||
index_t MPerThread,
|
||||
index_t NPerThread,
|
||||
index_t KPerThread,
|
||||
typename InScalarPerVectorSeq,
|
||||
typename OutScalarPerVectorSeq>
|
||||
struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
OutDataTypeTuple,
|
||||
ElementwiseOperation,
|
||||
NumDim_m + NumDim_n + NumDim_k>
|
||||
{
|
||||
static constexpr index_t NumDim = NumDim_m + NumDim_n + NumDim_k;
|
||||
|
||||
static constexpr int NumInput = InDataTypeTuple::Size();
|
||||
static constexpr int NumOutput = OutDataTypeTuple::Size();
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
static constexpr auto I4 = Number<4>{};
|
||||
|
||||
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
|
||||
NumOutput == OutScalarPerVectorSeq::Size(),
|
||||
"Tuple size is inconsistent with the number of in/out!");
|
||||
|
||||
static auto GenerateInDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<const DataType*>(nullptr);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
}
|
||||
|
||||
static auto GenerateOutDataTypePointerTuple()
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
|
||||
return static_cast<DataType*>(nullptr);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
}
|
||||
|
||||
using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple());
|
||||
using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());
|
||||
|
||||
template <typename Desc_MNK>
|
||||
static auto PadDescriptor_MNK(Desc_MNK desc_mnk,
|
||||
index_t gridSize,
|
||||
index_t blockSize,
|
||||
index_t num_threads_m,
|
||||
index_t num_threads_n,
|
||||
index_t num_threads_k)
|
||||
{
|
||||
std::ignore = blockSize;
|
||||
std::ignore = gridSize;
|
||||
|
||||
const auto m = desc_mnk.GetLength(I0);
|
||||
const auto n = desc_mnk.GetLength(I1);
|
||||
const auto k = desc_mnk.GetLength(I2);
|
||||
|
||||
const index_t loop_step_m = num_threads_m * MPerThread;
|
||||
const index_t loop_step_n = num_threads_n * NPerThread;
|
||||
const index_t loop_step_k = num_threads_k * KPerThread;
|
||||
|
||||
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
|
||||
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
|
||||
const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
|
||||
|
||||
const auto desc_mnk_pad =
|
||||
transform_tensor_descriptor(desc_mnk,
|
||||
make_tuple(make_right_pad_transform(m, pad_m),
|
||||
make_right_pad_transform(n, pad_n),
|
||||
make_right_pad_transform(k, pad_k)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
|
||||
return desc_mnk_pad;
|
||||
}
|
||||
|
||||
static auto MakeDescriptor_MNK(const std::array<index_t, NumDim>& lengths,
|
||||
const std::array<index_t, NumDim>& stride,
|
||||
index_t gridSize,
|
||||
index_t blockSize,
|
||||
index_t num_threads_m,
|
||||
index_t num_threads_n,
|
||||
index_t num_threads_k)
|
||||
{
|
||||
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
|
||||
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});
|
||||
|
||||
// nd desc - [s0, s1, s2, ...]
|
||||
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
|
||||
|
||||
constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type();
|
||||
constexpr auto nDimIds =
|
||||
typename arithmetic_sequence_gen<NumDim_m, NumDim_m + NumDim_n, 1>::type();
|
||||
constexpr auto kDimIds =
|
||||
typename arithmetic_sequence_gen<NumDim_m + NumDim_n, NumDim, 1>::type();
|
||||
|
||||
const auto mLengths = get_container_subset(tupleOfShape, mDimIds);
|
||||
const auto nLengths = get_container_subset(tupleOfShape, nDimIds);
|
||||
const auto kLengths = get_container_subset(tupleOfShape, kDimIds);
|
||||
|
||||
// merge nd to 3d desc - [s0 * s1 * ...]
|
||||
if constexpr(NumDim > 3)
|
||||
{
|
||||
const auto desc_mnk = transform_tensor_descriptor(
|
||||
desc,
|
||||
make_tuple(make_merge_transform(mLengths),
|
||||
make_merge_transform(nLengths),
|
||||
make_merge_transform(kLengths)),
|
||||
make_tuple(mDimIds, nDimIds, kDimIds),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
|
||||
|
||||
return PadDescriptor_MNK(
|
||||
desc_mnk, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
|
||||
}
|
||||
else
|
||||
return PadDescriptor_MNK(
|
||||
desc, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k);
|
||||
}
|
||||
|
||||
template <index_t TupleSize>
|
||||
static auto GenerateInOutGrid3dDescTuple(Number<TupleSize>)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto) {
|
||||
if constexpr(NumDim > 3)
|
||||
{
|
||||
return MakeDescriptor_MNK({1, 1, 1}, {1, 1, 1}, 1, 1, 1, 1, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
return MakeDescriptor_MNK({1}, {1}, 1, 1, 1, 1, 1);
|
||||
};
|
||||
},
|
||||
Number<TupleSize>{});
|
||||
}
|
||||
|
||||
using OutGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumOutput>{}));
|
||||
using InGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number<NumInput>{}));
|
||||
|
||||
using GridwiseElementwise = GridwiseElementwise_3D<InGrid3dDescTuple,
|
||||
OutGrid3dDescTuple,
|
||||
InDataTypePointerTuple,
|
||||
OutDataTypePointerTuple,
|
||||
ElementwiseOperation,
|
||||
MPerThread,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
InScalarPerVectorSeq,
|
||||
OutScalarPerVectorSeq>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::array<index_t, NumDim> lengths,
|
||||
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
|
||||
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
|
||||
const std::array<const void*, NumInput> in_dev_buffers,
|
||||
const std::array<void*, NumOutput> out_dev_buffers,
|
||||
ElementwiseOperation elementwise_op)
|
||||
|
||||
: lengths_(lengths),
|
||||
inStridesArray_(inStridesArray),
|
||||
outStridesArray_(outStridesArray),
|
||||
elementwise_op_(elementwise_op),
|
||||
blockSize_(256)
|
||||
{
|
||||
static_assert(NumDim_m > 0, "");
|
||||
static_assert(NumDim_n > 0, "");
|
||||
static_assert(NumDim_k > 0, "");
|
||||
|
||||
in_dev_buffers_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
|
||||
return static_cast<const DataType*>(in_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
out_dev_buffers_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
|
||||
return static_cast<DataType*>(out_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
}
|
||||
|
||||
InDataTypePointerTuple in_dev_buffers_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
|
||||
std::array<index_t, NumDim> lengths_;
|
||||
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
|
||||
std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
|
||||
|
||||
ElementwiseOperation elementwise_op_;
|
||||
index_t blockSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
index_t gridSize = getAvailableComputeUnitCount(stream_config) * arg.blockSize_;
|
||||
index_t num_threads_m = gridSize / (16 * 16);
|
||||
index_t num_threads_n = 16;
|
||||
index_t num_threads_k = 16;
|
||||
|
||||
auto in_grid_3d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MNK(arg.lengths_,
|
||||
arg.inStridesArray_[I.value],
|
||||
gridSize,
|
||||
arg.blockSize_,
|
||||
num_threads_m,
|
||||
num_threads_n,
|
||||
num_threads_k);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_grid_3d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MNK(arg.lengths_,
|
||||
arg.outStridesArray_[I.value],
|
||||
gridSize,
|
||||
arg.blockSize_,
|
||||
num_threads_m,
|
||||
num_threads_n,
|
||||
num_threads_k);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
const auto kernel = kernel_elementwise_3d<GridwiseElementwise,
|
||||
InGrid3dDescTuple,
|
||||
OutGrid3dDescTuple,
|
||||
InDataTypePointerTuple,
|
||||
OutDataTypePointerTuple,
|
||||
ElementwiseOperation>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(gridSize),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
in_grid_3d_desc_tuple,
|
||||
out_grid_3d_desc_tuple,
|
||||
arg.in_dev_buffers_,
|
||||
arg.out_dev_buffers_,
|
||||
arg.elementwise_op_,
|
||||
num_threads_m,
|
||||
num_threads_n,
|
||||
num_threads_k);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg == nullptr)
|
||||
return false;
|
||||
|
||||
if(pArg->lengths_.back() % MPerThread != 0)
|
||||
return false;
|
||||
|
||||
auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
|
||||
const std::array<index_t, NumDim>& strides,
|
||||
index_t scalarPerVector,
|
||||
index_t vectorDim) {
|
||||
if(strides[vectorDim] == 1 &&
|
||||
(lengths[vectorDim] % scalarPerVector == 0 ||
|
||||
lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if(strides[vectorDim] >= scalarPerVector)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
bool valid = true;
|
||||
static_for<0, NumInput, 1>{}([&](auto I) {
|
||||
valid = valid && IsScalarPerVectorValid(pArg->lengths_,
|
||||
pArg->inStridesArray_[I.value],
|
||||
InScalarPerVectorSeq::At(I),
|
||||
NumDim_m - 1);
|
||||
});
|
||||
|
||||
static_for<0, NumOutput, 1>{}([&](auto I) {
|
||||
valid = valid && IsScalarPerVectorValid(pArg->lengths_,
|
||||
pArg->outStridesArray_[I.value],
|
||||
OutScalarPerVectorSeq::At(I),
|
||||
NumDim - 1);
|
||||
});
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
|
||||
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
|
||||
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
|
||||
const std::array<const void*, NumInput> in_dev_buffers,
|
||||
const std::array<void*, NumOutput> out_dev_buffers,
|
||||
ElementwiseOperation elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(lengths,
|
||||
inStridesArray,
|
||||
outStridesArray,
|
||||
in_dev_buffers,
|
||||
out_dev_buffers,
|
||||
elementwise_op);
|
||||
}
|
||||
|
||||
static auto MakeInvoker() { return Invoker{}; }
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
}
|
||||
}; // namespace device
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
264
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
Normal file
264
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
Normal file
@@ -0,0 +1,264 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include "ck/tensor_description/cluster_descriptor.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename GridwiseElementwise3dFunctor,
|
||||
typename InGrid3dDescTuple,
|
||||
typename OutGrid3dDescTuple,
|
||||
typename InDataTypePointerTuple,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename ElementwiseOperation>
|
||||
__global__ void kernel_elementwise_3d(const InGrid3dDescTuple in_grid_3d_desc_tuple,
|
||||
const OutGrid3dDescTuple out_grid_3d_desc_tuple,
|
||||
const InDataTypePointerTuple p_in_global_tuple,
|
||||
const OutDataTypePointerTuple p_out_global_tuple,
|
||||
const ElementwiseOperation elementwise_op,
|
||||
const index_t num_threads_m,
|
||||
const index_t num_threads_n,
|
||||
const index_t num_threads_k)
|
||||
{
|
||||
GridwiseElementwise3dFunctor::Run(in_grid_3d_desc_tuple,
|
||||
out_grid_3d_desc_tuple,
|
||||
p_in_global_tuple,
|
||||
p_out_global_tuple,
|
||||
elementwise_op,
|
||||
num_threads_m,
|
||||
num_threads_n,
|
||||
num_threads_k);
|
||||
}
|
||||
|
||||
template <typename InGrid3dDescTuple,
|
||||
typename OutGrid3dDescTuple,
|
||||
typename InDataTypePointerTuple,
|
||||
typename OutDataTypePointerTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t MPerThread,
|
||||
index_t NPerThread,
|
||||
index_t KPerThread,
|
||||
typename InScalarPerVectorSeq,
|
||||
typename OutScalarPerVectorSeq>
|
||||
struct GridwiseElementwise_3D
|
||||
{
|
||||
static constexpr index_t NumInput = InDataTypePointerTuple::Size();
|
||||
static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();
|
||||
|
||||
static_assert(NumInput == InScalarPerVectorSeq::Size() &&
|
||||
NumOutput == OutScalarPerVectorSeq::Size() &&
|
||||
NumInput == InGrid3dDescTuple::Size() &&
|
||||
NumOutput == OutGrid3dDescTuple::Size(),
|
||||
"Tuple size is inconsistent with the number of in/out!");
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
|
||||
static constexpr auto thread_buffer_desc_mnk = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MPerThread>{}, Number<NPerThread>{}, Number<KPerThread>{}));
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
__device__ static void Run(const InGrid3dDescTuple in_grid_3d_desc_tuple,
|
||||
const OutGrid3dDescTuple out_grid_3d_desc_tuple,
|
||||
const InDataTypePointerTuple p_in_global_tuple,
|
||||
const OutDataTypePointerTuple p_out_global_tuple,
|
||||
const ElementwiseOperation elementwise_op,
|
||||
const index_t num_threads_m,
|
||||
const index_t num_threads_n,
|
||||
const index_t num_threads_k)
|
||||
{
|
||||
auto in_thread_buf_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
|
||||
using DataType = remove_cv_t<remove_pointer_t<DataTypePointer>>;
|
||||
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
DataType,
|
||||
MPerThread * NPerThread * KPerThread,
|
||||
true>{};
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_thread_buf_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
|
||||
using DataType = remove_pointer_t<DataTypePointer>;
|
||||
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
DataType,
|
||||
MPerThread * NPerThread * KPerThread,
|
||||
true>{};
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
auto in_global_buf_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_in_global_tuple[I], in_grid_3d_desc_tuple[I].GetElementSpaceSize());
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_global_buf_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_global_tuple[I], out_grid_3d_desc_tuple[I].GetElementSpaceSize());
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
const auto M = in_grid_3d_desc_tuple[I0].GetLength(I0);
|
||||
const auto N = in_grid_3d_desc_tuple[I0].GetLength(I1);
|
||||
const auto K = in_grid_3d_desc_tuple[I0].GetLength(I2);
|
||||
|
||||
const index_t loop_step_m = num_threads_m * MPerThread;
|
||||
const index_t loop_step_n = num_threads_n * NPerThread;
|
||||
const index_t loop_step_k = num_threads_k * KPerThread;
|
||||
|
||||
const index_t thread_1d_id = get_thread_global_1d_id();
|
||||
|
||||
const index_t tid_m = thread_1d_id / (num_threads_n * num_threads_k);
|
||||
const index_t tid_nk = thread_1d_id % (num_threads_n * num_threads_k);
|
||||
const index_t tid_n = tid_nk / num_threads_k;
|
||||
const index_t tid_k = tid_nk % num_threads_k;
|
||||
|
||||
const auto thread_global_offset =
|
||||
make_multi_index(tid_m * MPerThread, tid_n * NPerThread, tid_k * KPerThread);
|
||||
|
||||
auto in_global_load_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
|
||||
using DataType = remove_cv_t<remove_pointer_t<DataTypePointer>>;
|
||||
|
||||
return ThreadwiseTensorSliceTransfer_v2<
|
||||
DataType,
|
||||
DataType,
|
||||
decltype(in_grid_3d_desc_tuple[I]),
|
||||
decltype(thread_buffer_desc_mnk),
|
||||
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
|
||||
Sequence<0, 1, 2>, // DimAccessOrder
|
||||
01, // SrcVectorDim
|
||||
InScalarPerVectorSeq::At(I), // InScalarPerVectorSeq::At(I), //
|
||||
// ScalarPerVector
|
||||
1, // SrcScalarStrideInVector
|
||||
true>{in_grid_3d_desc_tuple[I], thread_global_offset};
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_global_store_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
|
||||
using DataType = remove_pointer_t<DataTypePointer>;
|
||||
|
||||
return ThreadwiseTensorSliceTransfer_v1r3<
|
||||
DataType,
|
||||
DataType,
|
||||
decltype(thread_buffer_desc_mnk),
|
||||
decltype(out_grid_3d_desc_tuple[I]),
|
||||
PassThroughOp,
|
||||
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
|
||||
Sequence<0, 1, 2>, // DimAccessOrder
|
||||
2, // SrcVectorDim
|
||||
OutScalarPerVectorSeq::At(I), // OutScalarPerVectorSeq::At(I),
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
index_t num_iter_m = M / (loop_step_m);
|
||||
do
|
||||
{
|
||||
index_t num_iter_n = N / (loop_step_n);
|
||||
do
|
||||
{
|
||||
index_t num_iter_k = K / (loop_step_k);
|
||||
do
|
||||
{
|
||||
static_for<0, NumInput, 1>{}([&](auto I) {
|
||||
in_global_load_tuple(I).Run(in_grid_3d_desc_tuple[I],
|
||||
in_global_buf_tuple[I],
|
||||
thread_buffer_desc_mnk,
|
||||
make_tuple(I0, I0, I0),
|
||||
in_thread_buf_tuple(I));
|
||||
|
||||
in_global_load_tuple(I).MoveSrcSliceWindow(
|
||||
in_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
|
||||
});
|
||||
|
||||
static_for<0, MPerThread, 1>{}([&](auto iM) {
|
||||
static_for<0, NPerThread, 1>{}([&](auto iN) {
|
||||
static_for<0, KPerThread, 1>{}([&](auto iK) {
|
||||
constexpr auto offset =
|
||||
thread_buffer_desc_mnk.CalculateOffset(make_tuple(iM, iN, iK));
|
||||
// get reference to in data
|
||||
const auto in_data_refs = generate_tie(
|
||||
// return type should be lvalue
|
||||
[&](auto I) -> const auto& {
|
||||
return in_thread_buf_tuple(I)(Number<offset>{});
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
// get referenec to dst data
|
||||
auto out_data_refs = generate_tie(
|
||||
// return type should be lvalue
|
||||
[&](auto I) -> auto& {
|
||||
return out_thread_buf_tuple(I)(Number<offset>{});
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
unpack2(elementwise_op, out_data_refs, in_data_refs);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
static_for<0, NumOutput, 1>{}([&](auto I) {
|
||||
out_global_store_tuple(I).Run(thread_buffer_desc_mnk,
|
||||
make_tuple(I0, I0, I0),
|
||||
out_thread_buf_tuple[I],
|
||||
out_grid_3d_desc_tuple[I],
|
||||
out_global_buf_tuple(I));
|
||||
|
||||
out_global_store_tuple(I).MoveDstSliceWindow(
|
||||
out_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k));
|
||||
});
|
||||
} while(--num_iter_k);
|
||||
|
||||
static_for<0, NumInput, 1>{}([&](auto I) {
|
||||
in_global_load_tuple(I).MoveSrcSliceWindow(
|
||||
in_grid_3d_desc_tuple[I],
|
||||
make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
|
||||
});
|
||||
|
||||
static_for<0, NumOutput, 1>{}([&](auto I) {
|
||||
out_global_store_tuple(I).MoveDstSliceWindow(
|
||||
out_grid_3d_desc_tuple[I],
|
||||
make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
|
||||
});
|
||||
|
||||
} while(--num_iter_n);
|
||||
|
||||
static_for<0, NumInput, 1>{}([&](auto I) {
|
||||
in_global_load_tuple(I).MoveSrcSliceWindow(
|
||||
in_grid_3d_desc_tuple[I],
|
||||
make_multi_index(loop_step_m,
|
||||
-(N / loop_step_n) * loop_step_n,
|
||||
-(K / loop_step_k) * loop_step_k));
|
||||
});
|
||||
|
||||
static_for<0, NumOutput, 1>{}([&](auto I) {
|
||||
out_global_store_tuple(I).MoveDstSliceWindow(
|
||||
out_grid_3d_desc_tuple[I],
|
||||
make_multi_index(loop_step_m,
|
||||
-(N / loop_step_n) * loop_step_n,
|
||||
-(K / loop_step_k) * loop_step_k));
|
||||
});
|
||||
} while(--num_iter_m);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,44 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using device_transpose_f16_instances = std::tuple<
|
||||
// FOR 16, 32, 16, 32, 16
|
||||
// clang-format off
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
using device_transpose_f32_instances = std::tuple<
|
||||
// for 16, 8, 16, 32, 8 -> test with instances for fp16
|
||||
// clang-format off
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
void add_device_transpose_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
|
||||
instances);
|
||||
|
||||
void add_device_transpose_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
|
||||
instances);
|
||||
|
||||
template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t NumDim>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::
|
||||
DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>
|
||||
{
|
||||
using DeviceOp =
|
||||
DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_transpose_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_transpose_f16_instances(op_ptrs);
|
||||
}
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,3 @@
|
||||
add_instance_library(device_transpose_instance
|
||||
device_transpose_instances_3d.cpp
|
||||
)
|
||||
@@ -0,0 +1,43 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
void add_device_transpose_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_transpose_f16_instances{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_transpose_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_transpose_f32_instances{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
182
profiler/include/profiler/profile_transpose_impl.hpp
Normal file
182
profiler/include/profiler/profile_transpose_impl.hpp
Normal file
@@ -0,0 +1,182 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
|
||||
{
|
||||
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
|
||||
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
|
||||
for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
|
||||
for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
|
||||
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
|
||||
{
|
||||
auto a_val = A_ncdhw(n, c, d, h, w);
|
||||
functor(B_nchwd(n, c, h, w, d), a_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, index_t NumDim>
|
||||
bool profile_transpose_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> lengths)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
index_t N = lengths[0];
|
||||
index_t C = lengths[1];
|
||||
index_t D = lengths[2];
|
||||
index_t H = lengths[3];
|
||||
index_t W = lengths[4];
|
||||
|
||||
std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
|
||||
std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
|
||||
Tensor<ADataType> a(ncdhw);
|
||||
Tensor<BDataType> b(ndhwc);
|
||||
Tensor<BDataType> host_b(ndhwc);
|
||||
|
||||
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
|
||||
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
|
||||
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
|
||||
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
|
||||
|
||||
std::cout << "A: " << a.mDesc << std::endl;
|
||||
std::cout << "B: " << b.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
|
||||
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
}
|
||||
|
||||
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// const auto element_op = ElementOp{};
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
using DeviceOp = ck::tensor_operation::device::
|
||||
DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_elementwise4D(host_b, a, ElementOp{});
|
||||
}
|
||||
|
||||
std::string best_op_name;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
{
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
|
||||
// re-init C to zero before profiling next kernel
|
||||
b_device_buf.SetZero();
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop =
|
||||
std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
|
||||
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
|
||||
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
|
||||
<< " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace ck
|
||||
85
profiler/src/profile_transpose.cpp
Normal file
85
profiler/src/profile_transpose.cpp
Normal file
@@ -0,0 +1,85 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct MatrixLayout
|
||||
{
|
||||
NCDHW, // 0
|
||||
NCHWD, // 1
|
||||
};
|
||||
|
||||
enum struct DataType
|
||||
{
|
||||
F32_F32_F32_F32_F32, // 0
|
||||
F16_F16_F16_F16_F16, // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "transpose"
|
||||
#define OP_DESC "Transpose"
|
||||
|
||||
int profile_transpose(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 15)
|
||||
{
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
// printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
|
||||
printf("arg4: verification (0: no; 1: yes)\n");
|
||||
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
|
||||
printf("arg6: print tensor value (0: no; 1: yes)\n");
|
||||
printf("arg7: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg8 to 13: N, C, D, H, W\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
|
||||
// const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
std::vector<index_t> lengths = std::stoi(argv[7]);
|
||||
|
||||
/**const int N = std::stoi(argv[7]);
|
||||
const int C = std::stoi(argv[8]);
|
||||
const int D = std::stoi(argv[9]);
|
||||
const int H = std::stoi(argv[10]);
|
||||
const int W = std::stoi(argv[11]);**/
|
||||
|
||||
using F32 = float;
|
||||
using F16 = ck::half_t;
|
||||
|
||||
auto profile = [&](auto a_type, auto b_type) {
|
||||
using ADataType = decltype(a_type);
|
||||
using BDataType = decltype(b_type);
|
||||
|
||||
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
|
||||
do_verification, init_method, do_log, time_kernel, lengths);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
if(data_type == GemmDataType::F32_F32_F32_F32_F32)
|
||||
{
|
||||
return profile(F32{}, F32{});
|
||||
}
|
||||
else if(data_type == GemmDataType::F16_F16_F16_F16_F16)
|
||||
{
|
||||
return profile(F16{}, F16{});
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "this data_type & layout is not implemented" << std::endl;
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_transpose);
|
||||
@@ -148,6 +148,7 @@ add_subdirectory(pool)
|
||||
add_subdirectory(batched_gemm_multi_d)
|
||||
add_subdirectory(grouped_convnd_bwd_data)
|
||||
add_subdirectory(conv_tensor_rearrange)
|
||||
add_subdirectory(transpose)
|
||||
if(GPU_TARGETS MATCHES "gfx11")
|
||||
add_subdirectory(wmma_op)
|
||||
endif()
|
||||
|
||||
9
test/transpose/CMakeLists.txt
Normal file
9
test/transpose/CMakeLists.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_gtest_executable(test_transpose test_transpose.cpp)
|
||||
target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
27
test/transpose/test_transpose.cpp
Normal file
27
test/transpose/test_transpose.cpp
Normal file
@@ -0,0 +1,27 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "test_transpose_util.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestTranspose : public ::testing::Test
|
||||
{
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
using KernelTypes = ::testing::Types<
|
||||
std::tuple< F16, F16>,
|
||||
std::tuple< F32, F32>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
TYPED_TEST_SUITE(TestTranspose, KernelTypes);
|
||||
|
||||
//#include "test_transpose_ut_cases.inc"
|
||||
30
test/transpose/test_transpose_ut_cases.inc
Normal file
30
test/transpose/test_transpose_ut_cases.inc
Normal file
@@ -0,0 +1,30 @@
|
||||
#pragma once
|
||||
|
||||
TYPED_TEST(TestTranspose, Test1)
|
||||
{
|
||||
// for 16, 8, 16, 32, 8
|
||||
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
|
||||
std::vector<index_t> lengths{16, 8, 16, 32, 8};
|
||||
/**constexpr int N = 16;
|
||||
constexpr int C = 8;
|
||||
constexpr int D = 16;
|
||||
constexpr int H = 32;
|
||||
constexpr int W = 8;**/
|
||||
|
||||
this->Run();
|
||||
}
|
||||
|
||||
|
||||
TYPED_TEST(TestTranpose, Test2)
|
||||
{
|
||||
std::vector<int> Ms{127, 255, 312, 799, 1573};
|
||||
std::vector<index_t> lengths{16, 8, 16, 32, 16};
|
||||
/**constexpr int N = 16;
|
||||
constexpr int C = 8;
|
||||
constexpr int D = 16;
|
||||
constexpr int H = 32;
|
||||
constexpr int W = 8;**/
|
||||
|
||||
this->Run();
|
||||
}
|
||||
|
||||
54
test/transpose/test_transpose_util.hpp
Normal file
54
test/transpose/test_transpose_util.hpp
Normal file
@@ -0,0 +1,54 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "include/ck/utility/data_type.hpp"
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
template <typename Tuple>
|
||||
class TestTranspose : public testing::Test
|
||||
{
|
||||
using F32 = float;
|
||||
|
||||
protected:
|
||||
using ADataType = std::tuple_element_t<0, Tuple>;
|
||||
using BDataType = std::tuple_element_t<1, Tuple>;
|
||||
|
||||
public:
|
||||
static constexpr bool verify_ = true;
|
||||
static constexpr int init_method_ = 1; // decimal value initialization
|
||||
static constexpr bool log_ = false;
|
||||
static constexpr bool bench_ = false; // measure kernel performance
|
||||
std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
|
||||
|
||||
void Run()
|
||||
{
|
||||
for(auto length : this->lengths_)
|
||||
{
|
||||
this->RunSingle(length);
|
||||
}
|
||||
}
|
||||
|
||||
void RunSingle()
|
||||
{
|
||||
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
|
||||
verify_, init_method_, log_, bench_, lengths_);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace test
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user