mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Transpose profiler fix (#1114)
* added working example for 5D input using 1D kernel
* example with 5D input tensor and 2d kernel - not working: issues with arguments
* added updated version of 3d device op - changed descriptors/dims
* added example file to check kernel
* fixed descriptor and isSupportedArgument stride problem
* added and modified kernel for 3d - updated tids/loop
* adding some more 5d example files
* fixed some issues
* changes made for testing
* working version: fixed error in stride for A, still a bit inefficient
* cleaned up formatting/comments
* updating formatting
* more formatting fixes
* fixing cmake, adding back gpu targets in cmake script
* adding client example
* added instances for client example
* fixed errors in client example
* implemented client ex with device_elementwise.hpp and device_elementwise_3d_impl.hpp
* removed extra files
* minor formatting and naming fixes
* adding test files and profiler
* fixing minor error
* minor fix
* removed unneccesary comments, renamed files
* updated instance list for client example, added different layout example
* removing instances
* fixed error in instance generation
* remove comments
* update profiler and client example tensor layouts
* fixed errors in test/profiler
* updated vector dim access to enable vector load
* updated test/profiler files
* updated example with 1d kernel
* updating profiler
* renamed files
* disabled device op for MI300
* skip elementwise_permute_2d on gfx94x
* Update CMakeLists.txt
* fixing CMake - disabling some GPU targets
* added transpose profiler to CMake
* fixed transpose profiler errors
* fixed instances for tests/profiler
* cleaned up code in transpose profiler source code
* added some comments, updated copyright
* made function arguments const where possible
---------
Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: Jing Zhang <jizhan@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
[ROCm/composable_kernel commit: aa3e2d7967]
This commit is contained in:
@@ -14,8 +14,8 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using ADataType = F32;
|
||||
using BDataType = F32;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
@@ -25,10 +25,10 @@ using DeviceElementwisePermuteInstance =
|
||||
2, // NumDim_m, {N, C}
|
||||
2, // NumDim_n, {H, W}
|
||||
1, // NumDim_k, {D}
|
||||
8, // MPerThread
|
||||
8, // NPerThread
|
||||
8, // KPerThread
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
4, // MPerThread
|
||||
4, // NPerThread
|
||||
4, // KPerThread
|
||||
ck::Sequence<4>, // InScalarPerVectorSeq
|
||||
ck::Sequence<4>>; // OutScalarPerVectorSeq
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
|
||||
@@ -21,20 +21,19 @@ template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using device_transpose_f16_instances = std::tuple<
|
||||
// FOR 16, 32, 16, 32, 16
|
||||
// clang-format off
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<8>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<4>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 4, 4, 8, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
using device_transpose_f32_instances = std::tuple<
|
||||
// for 16, 8, 16, 32, 8 -> test with instances for fp16
|
||||
// clang-format off
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>>
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<1>>,
|
||||
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<4>>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
|
||||
void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
|
||||
{
|
||||
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
|
||||
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
|
||||
@@ -34,7 +34,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo
|
||||
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
|
||||
{
|
||||
auto a_val = A_ncdhw(n, c, d, h, w);
|
||||
functor(B_nchwd(n, c, h, w, d), a_val);
|
||||
functor(B_ndhwc(n, d, h, w, c), a_val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,8 +77,6 @@ bool profile_transpose_impl(int do_verification,
|
||||
|
||||
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// const auto element_op = ElementOp{};
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
@@ -118,6 +116,7 @@ bool profile_transpose_impl(int do_verification,
|
||||
// re-init C to zero before profiling next kernel
|
||||
b_device_buf.SetZero();
|
||||
|
||||
// run for verification
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
if(do_verification)
|
||||
@@ -136,6 +135,7 @@ bool profile_transpose_impl(int do_verification,
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
// run for timing purposes
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
@@ -153,10 +153,6 @@ bool profile_transpose_impl(int do_verification,
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
|
||||
@@ -29,6 +29,7 @@ set(PROFILER_SOURCES
|
||||
profile_batchnorm_infer.cpp
|
||||
profile_grouped_conv_bwd_data.cpp
|
||||
profile_conv_tensor_rearrange.cpp
|
||||
profile_transpose.cpp
|
||||
)
|
||||
|
||||
if(DL_KERNELS)
|
||||
@@ -91,6 +92,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
|
||||
|
||||
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
|
||||
|
||||
112
profiler/src/profile_transpose.cpp
Normal file
112
profiler/src/profile_transpose.cpp
Normal file
@@ -0,0 +1,112 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct DataType
|
||||
{
|
||||
F32_F32_F32_F32_F32, // 0
|
||||
F16_F16_F16_F16_F16, // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "transpose"
|
||||
#define OP_DESC "Transpose"
|
||||
|
||||
struct TransposeArgParser
|
||||
{
|
||||
std::unordered_map<std::string, std::vector<int>> long_opts = {{"lengths", {}}};
|
||||
|
||||
bool parse_opt(const int argc, char* argv[], const std::string& key, int i)
|
||||
{
|
||||
if(std::string("--") + key == argv[i])
|
||||
{
|
||||
const int pos = i;
|
||||
while(++i < argc && argv[i][0] != '-') {}
|
||||
int end = i;
|
||||
for(int j = pos + 1; j < end; j++)
|
||||
{
|
||||
long_opts[key].push_back(std::stoi(argv[j]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void operator()(int argc, char* argv[])
|
||||
{
|
||||
for(auto& kv : long_opts)
|
||||
{
|
||||
for(int i = 1; i < argc; i++)
|
||||
{
|
||||
if(parse_opt(argc, argv, kv.first, i))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: verification (0: no; 1: yes)\n");
|
||||
printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
|
||||
printf("arg5: print tensor value (0: no; 1: yes)\n");
|
||||
printf("arg6: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg7: --lengths: N, C, D, H, W\n");
|
||||
}
|
||||
|
||||
int profile_transpose(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 7)
|
||||
{
|
||||
print_helper_msg();
|
||||
exit(1);
|
||||
}
|
||||
TransposeArgParser arg_parser;
|
||||
|
||||
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
arg_parser(argc, argv);
|
||||
const std::vector<ck::index_t> lengths = arg_parser.long_opts["lengths"];
|
||||
|
||||
using F32 = float;
|
||||
using F16 = ck::half_t;
|
||||
|
||||
auto profile = [&](auto a_type, auto b_type) {
|
||||
using ADataType = decltype(a_type);
|
||||
using BDataType = decltype(b_type);
|
||||
constexpr ck::index_t NumDim = 5;
|
||||
|
||||
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, NumDim>(
|
||||
do_verification, init_method, do_log, time_kernel, lengths);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
if(data_type == DataType::F32_F32_F32_F32_F32)
|
||||
{
|
||||
return profile(F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16_F16_F16_F16)
|
||||
{
|
||||
return profile(F16{}, F16{});
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "this data_type & layout is not implemented" << std::endl;
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
|
||||
@@ -1,27 +1,35 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "test_transpose_util.hpp"
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using ck::index_t;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestTranspose : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using ADataType = std::tuple_element_t<0, Tuple>;
|
||||
using BDataType = std::tuple_element_t<1, Tuple>;
|
||||
|
||||
void Run()
|
||||
{
|
||||
std::vector<std::vector<ck::index_t>> lengths = {
|
||||
{4, 16, 16, 32, 5}, {8, 16, 16, 32, 8} /**{32, 16, 16, 32, 8},**/};
|
||||
|
||||
for(auto length : lengths)
|
||||
{
|
||||
bool success = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
|
||||
true, 2, false, false, length);
|
||||
EXPECT_TRUE(success);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
using KernelTypes = ::testing::Types<
|
||||
std::tuple< F16, F16>,
|
||||
std::tuple< F32, F32>
|
||||
>;
|
||||
// clang-format on
|
||||
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestTranspose, KernelTypes);
|
||||
|
||||
//#include "test_transpose_ut_cases.inc"
|
||||
TYPED_TEST(TestTranspose, Test_FP16) { this->Run(); }
|
||||
TYPED_TEST(TestTranspose, Test_FP32) { this->Run(); }
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
TYPED_TEST(TestTranspose, Test1)
|
||||
{
|
||||
// for 16, 8, 16, 32, 8
|
||||
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
|
||||
std::vector<index_t> lengths{16, 8, 16, 32, 8};
|
||||
/**constexpr int N = 16;
|
||||
constexpr int C = 8;
|
||||
constexpr int D = 16;
|
||||
constexpr int H = 32;
|
||||
constexpr int W = 8;**/
|
||||
|
||||
this->Run();
|
||||
}
|
||||
|
||||
TYPED_TEST(TestTranpose, Test2)
|
||||
{
|
||||
std::vector<int> Ms{127, 255, 312, 799, 1573};
|
||||
std::vector<index_t> lengths{16, 8, 16, 32, 16};
|
||||
/**constexpr int N = 16;
|
||||
constexpr int C = 8;
|
||||
constexpr int D = 16;
|
||||
constexpr int H = 32;
|
||||
constexpr int W = 8;**/
|
||||
|
||||
this->Run();
|
||||
}
|
||||
@@ -1,54 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "include/ck/utility/data_type.hpp"
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
template <typename Tuple>
|
||||
class TestTranspose : public testing::Test
|
||||
{
|
||||
using F32 = float;
|
||||
|
||||
protected:
|
||||
using ADataType = std::tuple_element_t<0, Tuple>;
|
||||
using BDataType = std::tuple_element_t<1, Tuple>;
|
||||
|
||||
public:
|
||||
static constexpr bool verify_ = true;
|
||||
static constexpr int init_method_ = 1; // decimal value initialization
|
||||
static constexpr bool log_ = false;
|
||||
static constexpr bool bench_ = false; // measure kernel performance
|
||||
std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
|
||||
|
||||
void Run()
|
||||
{
|
||||
for(auto length : this->lengths_)
|
||||
{
|
||||
this->RunSingle(length);
|
||||
}
|
||||
}
|
||||
|
||||
void RunSingle()
|
||||
{
|
||||
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
|
||||
verify_, init_method_, log_, bench_, lengths_);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace test
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user