Transpose 3d (#984)

* added working example for 5D input using 1D kernel

* example with 5D input tensor and 2d kernel - not working: issues with arguments

* added updated version of 3d device op - changed descriptors/dims

* added example file to check kernel

* fixed descriptor and isSupportedArgument stride problem

* added and modified kernel for 3d - updated tids/loop

* adding some more 5d example files

* fixed some issues

* changes made for testing

* working version: fixed error in stride for A, still a bit inefficient

* cleaned up formatting/comments

* updating formatting

* more formatting fixes

* fixing cmake, adding back gpu targets in cmake script

* adding client example

* added instances for client example

* fixed errors in client example

* implemented client ex with device_elementwise.hpp and device_elementwise_3d_impl.hpp

* removed extra files

* minor formatting and naming fixes

* adding test files and profiler

* fixing minor error

* minor fix

* removed unneccesary comments, renamed files

* updated instance list for client example, added different layout example

* removing instances

* fixed error in instance generation

* remove comments

* update profiler and client example tensor layouts

* fixed errors in test/profiler

* updated vector dim access to enable vector load

* updated test/profiler files

* updated example with 1d kernel

* updating profiler

* renamed files

---------

Co-authored-by: Jing Zhang <jizha@amd.com>

[ROCm/composable_kernel commit: 3af8c81a72]
This commit is contained in:
arai713
2023-11-08 17:45:07 -08:00
committed by GitHub
parent 867bc90509
commit cba606adf6
20 changed files with 1582 additions and 27 deletions

View File

@@ -148,6 +148,7 @@ add_subdirectory(pool)
add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
add_subdirectory(conv_tensor_rearrange)
add_subdirectory(transpose)
if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()

View File

@@ -0,0 +1,9 @@
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_gtest_executable(test_transpose test_transpose.cpp)
target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
set(target 1)
endif()
endforeach()

View File

@@ -0,0 +1,27 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_transpose_util.hpp"
using F16 = ck::half_t;
using F32 = float;
template <typename Tuple>
class TestTranspose : public ::testing::Test
{
};
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple< F16, F16>,
std::tuple< F32, F32>
>;
// clang-format on
TYPED_TEST_SUITE(TestTranspose, KernelTypes);
//#include "test_transpose_ut_cases.inc"

View File

@@ -0,0 +1,30 @@
#pragma once
TYPED_TEST(TestTranspose, Test1)
{
// for 16, 8, 16, 32, 8
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
std::vector<index_t> lengths{16, 8, 16, 32, 8};
/**constexpr int N = 16;
constexpr int C = 8;
constexpr int D = 16;
constexpr int H = 32;
constexpr int W = 8;**/
this->Run();
}
TYPED_TEST(TestTranpose, Test2)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
std::vector<index_t> lengths{16, 8, 16, 32, 16};
/**constexpr int N = 16;
constexpr int C = 8;
constexpr int D = 16;
constexpr int H = 32;
constexpr int W = 8;**/
this->Run();
}

View File

@@ -0,0 +1,54 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_transpose_impl.hpp"
namespace ck {
namespace test {
template <typename Tuple>
class TestTranspose : public testing::Test
{
using F32 = float;
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
public:
static constexpr bool verify_ = true;
static constexpr int init_method_ = 1; // decimal value initialization
static constexpr bool log_ = false;
static constexpr bool bench_ = false; // measure kernel performance
std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
void Run()
{
for(auto length : this->lengths_)
{
this->RunSingle(length);
}
}
void RunSingle()
{
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
verify_, init_method_, log_, bench_, lengths_);
EXPECT_TRUE(pass);
}
};
} // namespace test
} // namespace ck