diff --git a/example/44_elementwise_permute/elementwise_permute_3d.cpp b/example/44_elementwise_permute/elementwise_permute_3d.cpp index 669785a545..b061c0da34 100644 --- a/example/44_elementwise_permute/elementwise_permute_3d.cpp +++ b/example/44_elementwise_permute/elementwise_permute_3d.cpp @@ -14,8 +14,8 @@ using F16 = ck::half_t; using F32 = float; -using ADataType = F16; -using BDataType = F16; +using ADataType = F32; +using BDataType = F32; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using DeviceElementwisePermuteInstance = @@ -25,10 +25,10 @@ using DeviceElementwisePermuteInstance = 2, // NumDim_m, {N, C} 2, // NumDim_n, {H, W} 1, // NumDim_k, {D} - 8, // MPerThread - 8, // NPerThread - 8, // KPerThread - ck::Sequence<8>, // InScalarPerVectorSeq + 4, // MPerThread + 4, // NPerThread + 4, // KPerThread + ck::Sequence<4>, // InScalarPerVectorSeq ck::Sequence<4>>; // OutScalarPerVectorSeq template diff --git a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp index 817e717a89..6ac0871a80 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp @@ -21,20 +21,19 @@ template using S = ck::Sequence; using device_transpose_f16_instances = std::tuple< - // FOR 16, 32, 16, 32, 16 // clang-format off - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>> + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<8>>, + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<4>>, + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 8, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>> // clang-format on >; using device_transpose_f32_instances = std::tuple< - // for 16, 8, 16, 32, 8 -> test with instances for fp16 // clang-format off DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>> + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<1>>, + DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<4>> // clang-format on >; diff --git a/profiler/include/profiler/profile_transpose_impl.hpp b/profiler/include/profiler/profile_transpose_impl.hpp index 3dae9ef48b..a4f2cb6763 100644 --- a/profiler/include/profiler/profile_transpose_impl.hpp +++ b/profiler/include/profiler/profile_transpose_impl.hpp @@ -25,7 +25,7 @@ namespace ck { namespace profiler { template -void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor) +void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor) { for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n) for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c) @@ -34,7 +34,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w) { auto a_val = A_ncdhw(n, c, d, h, w); - functor(B_nchwd(n, c, h, w, d), a_val); + functor(B_ndhwc(n, d, h, w, c), a_val); } } @@ -77,8 +77,6 @@ bool profile_transpose_impl(int do_verification, using ElementOp = ck::tensor_operation::element_wise::PassThrough; - // const auto element_op = ElementOp{}; - DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); @@ -118,6 +116,7 @@ bool profile_transpose_impl(int do_verification, // re-init C to zero before profiling next kernel b_device_buf.SetZero(); + // run for verification invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); if(do_verification) @@ -136,6 +135,7 @@ bool profile_transpose_impl(int do_verification, std::string op_name = op_ptr->GetTypeString(); + // run for timing purposes float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); @@ -153,10 +153,6 @@ bool profile_transpose_impl(int do_verification, std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl; - // pass = pass & ck::utils::check_err(b_device_result, b_host_result); - pass &= ck::utils::check_err( - b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); - if(tflops > best_tflops) { best_op_name = op_name; diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 5144785aa0..68ef04ed11 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -29,6 +29,7 @@ set(PROFILER_SOURCES profile_batchnorm_infer.cpp profile_grouped_conv_bwd_data.cpp profile_conv_tensor_rearrange.cpp + profile_transpose.cpp ) if(DL_KERNELS) @@ -91,6 +92,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance) if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp new file mode 100644 index 0000000000..d04c9fa2c4 --- /dev/null +++ b/profiler/src/profile_transpose.cpp @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_transpose_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct DataType +{ + F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F16_F16, // 1 +}; + +#define OP_NAME "transpose" +#define OP_DESC "Transpose" + +struct TransposeArgParser +{ + std::unordered_map> long_opts = {{"lengths", {}}}; + + bool parse_opt(const int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + const int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +static void print_helper_msg() +{ + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: verification (0: no; 1: yes)\n"); + printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg5: print tensor value (0: no; 1: yes)\n"); + printf("arg6: time kernel (0=no, 1=yes)\n"); + printf("arg7: --lengths: N, C, D, H, W\n"); +} + +int profile_transpose(int argc, char* argv[]) +{ + if(argc != 7) + { + print_helper_msg(); + exit(1); + } + TransposeArgParser arg_parser; + + const auto data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + arg_parser(argc, argv); + const std::vector lengths = arg_parser.long_opts["lengths"]; + + using F32 = float; + using F16 = ck::half_t; + + auto profile = [&](auto a_type, auto b_type) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + constexpr ck::index_t NumDim = 5; + + bool pass = ck::profiler::profile_transpose_impl( + do_verification, init_method, do_log, time_kernel, lengths); + + return pass ? 0 : 1; + }; + + if(data_type == DataType::F32_F32_F32_F32_F32) + { + return profile(F32{}, F32{}); + } + else if(data_type == DataType::F16_F16_F16_F16_F16) + { + return profile(F16{}, F16{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose); diff --git a/test/transpose/test_transpose.cpp b/test/transpose/test_transpose.cpp index 74991c62da..ead622b4d7 100644 --- a/test/transpose/test_transpose.cpp +++ b/test/transpose/test_transpose.cpp @@ -1,27 +1,35 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include - #include "gtest/gtest.h" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "test_transpose_util.hpp" +#include "profiler/profile_transpose_impl.hpp" using F16 = ck::half_t; using F32 = float; +using ck::index_t; template class TestTranspose : public ::testing::Test { + protected: + using ADataType = std::tuple_element_t<0, Tuple>; + using BDataType = std::tuple_element_t<1, Tuple>; + + void Run() + { + std::vector> lengths = { + {4, 16, 16, 32, 5}, {8, 16, 16, 32, 8} /**{32, 16, 16, 32, 8},**/}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_transpose_impl( + true, 2, false, false, length); + EXPECT_TRUE(success); + } + } }; -// clang-format off -using KernelTypes = ::testing::Types< - std::tuple< F16, F16>, - std::tuple< F32, F32> - >; -// clang-format on +using KernelTypes = ::testing::Types, std::tuple>; TYPED_TEST_SUITE(TestTranspose, KernelTypes); - -//#include "test_transpose_ut_cases.inc" +TYPED_TEST(TestTranspose, Test_FP16) { this->Run(); } +TYPED_TEST(TestTranspose, Test_FP32) { this->Run(); } diff --git a/test/transpose/test_transpose_ut_cases.inc b/test/transpose/test_transpose_ut_cases.inc deleted file mode 100644 index 59a2a6c72c..0000000000 --- a/test/transpose/test_transpose_ut_cases.inc +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -TYPED_TEST(TestTranspose, Test1) -{ - // for 16, 8, 16, 32, 8 - std::vector Ms{1, 2, 3, 4, 5, 6}; - std::vector lengths{16, 8, 16, 32, 8}; - /**constexpr int N = 16; - constexpr int C = 8; - constexpr int D = 16; - constexpr int H = 32; - constexpr int W = 8;**/ - - this->Run(); -} - -TYPED_TEST(TestTranpose, Test2) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - std::vector lengths{16, 8, 16, 32, 16}; - /**constexpr int N = 16; - constexpr int C = 8; - constexpr int D = 16; - constexpr int H = 32; - constexpr int W = 8;**/ - - this->Run(); -} diff --git a/test/transpose/test_transpose_util.hpp b/test/transpose/test_transpose_util.hpp deleted file mode 100644 index 4bc25a6032..0000000000 --- a/test/transpose/test_transpose_util.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "include/ck/utility/data_type.hpp" -#include "profiler/profile_transpose_impl.hpp" - -namespace ck { -namespace test { - -template -class TestTranspose : public testing::Test -{ - using F32 = float; - - protected: - using ADataType = std::tuple_element_t<0, Tuple>; - using BDataType = std::tuple_element_t<1, Tuple>; - - public: - static constexpr bool verify_ = true; - static constexpr int init_method_ = 1; // decimal value initialization - static constexpr bool log_ = false; - static constexpr bool bench_ = false; // measure kernel performance - std::vector> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}}; - - void Run() - { - for(auto length : this->lengths_) - { - this->RunSingle(length); - } - } - - void RunSingle() - { - bool pass = ck::profiler::profile_transpose_impl( - verify_, init_method_, log_, bench_, lengths_); - EXPECT_TRUE(pass); - } -}; - -} // namespace test -} // namespace ck