transpose load api development (#2177)

* add transpose load; no real logic

* fix some compile errors

* fix some issues

* update transpose load logic

* add some fixes

* fix a distribution issue

* update some codes

* add some fix

* can pass; but no logic

* transpose load enable

* update tile transpose

* miss output tile distribution mapping

* hack for transpose 16x16

* update output tensor distribution

* delete unused variables

* fix transpose related codes

* update transpose load example

* exchange the iteration order

* fix 16x16 related dimension transpose

* fix a transpose index issue

* fix a transpose index issue

* fix clang format check

* update load tile transpose related codes

* fix compile errors and pass 16x16 tests

* fix a typo

* update logic

* check other data types

* add transpose load api

* update transpose load api

* fix clang format check

* change file name

* refactor codes

* update code name

* delete some unused codes

* delete the unused oob flag for transpose load

* update tensor view api for transpose load

* update for testing

* fix a typo error

* move transpose ops to example directory

* update transpose api

* update include file

* fix for pr review

* fix compile errors

* add transpose load; no real logic

* fix some compile errors

* fix some issues

* update transpose load logic

* add some fixes

* fix a distribution issue

* update some codes

* add some fix

* can pass; but no logic

* transpose load enable

* update tile transpose

* miss output tile distribution mapping

* hack for transpose 16x16

* update output tensor distribution

* delete unused variables

* fix transpose related codes

* update transpose load example

* exchange the iteration order

* fix 16x16 related dimension transpose

* fix a transpose index issue

* fix a transpose index issue

* fix clang format check

* update load tile transpose related codes

* fix compile errors and pass 16x16 tests

* fix a typo

* update logic

* check other data types

* add transpose load api

* update transpose load api

* fix clang format check

* change file name

* refactor codes

* update code name

* delete some unused codes

* delete the unused oob flag for transpose load

* update tensor view api for transpose load

* update for testing

* fix a typo error

* move transpose ops to example directory

* update transpose api

* update include file

* fix for pr review

* fix compile errors

* change directory name

* delete the duplicated directory

* update cmakelists file

* delete the unused codes

* update function names

* update transpose policy

* update code after remod.py

* update codes

* add some comment

* Polish the instr infrastructure

* build up the fixed instr

* redesign the transpose api; currently it has a numerical error

* add the bf16 transpose

* fix some issues

* add some comments

* update document

* Finish the API refactor and pass verification

* fix the merging issue

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
This commit is contained in:
joyeamd
2025-06-18 16:28:34 +08:00
committed by GitHub
parent 64a2fda713
commit a2f01141aa
17 changed files with 1523 additions and 1 deletions

View File

@@ -2784,6 +2784,40 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
#endif
}
/**
 * @brief Perform a transposing `ds_read_tr*` load and return it as `thread_buffer<T, N>`.
 *
 * The element type selects the builtin at compile time:
 *  - `ck_tile::half_t` -> `__builtin_amdgcn_ds_read_tr16_b64_v4f16`
 *  - `ck_tile::bf16_t` -> `__builtin_amdgcn_ds_read_tr16_b64_v4bf16`
 *  - `ck_tile::fp8_t`  -> `__builtin_amdgcn_ds_read_tr8_b64_v2i32`
 * Every other element type is rejected with a compile-time error.
 *
 * @tparam T Element type; cv/ref qualifiers are stripped before dispatch.
 * @tparam N Element count of the returned `thread_buffer`.
 *           NOTE(review): the builtin results are 4 x 16-bit / 2 x index_t vectors, so
 *           presumably `thread_buffer<T, N>` must have that same size for `bit_cast`
 *           to be valid -- confirm against `bit_cast`.
 * @tparam BufferAddressSpace Not referenced by this function body.
 * @param in_ptr Source address. Its integer value is reinterpreted as an
 *               `address_space(3)` pointer before being handed to the builtin.
 * @return The vector produced by the builtin, bit-cast to `thread_buffer<T, N>`.
 */
template <typename T, index_t N, address_space_enum BufferAddressSpace>
__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)
{
    using elem_t = remove_cvref_t<T>;

    if constexpr(std::is_same_v<elem_t, ck_tile::half_t>)
    {
        typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
        // Go through uintptr_t so the generic pointer can be re-typed as address_space(3).
        __attribute__((address_space(3))) llvm_fp16x4_t* lds_ptr =
            reinterpret_cast<__attribute__((address_space(3))) llvm_fp16x4_t*>(
                reinterpret_cast<uintptr_t>(in_ptr));
        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
    }
    else if constexpr(std::is_same_v<elem_t, ck_tile::bf16_t>)
    {
        typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
        __attribute__((address_space(3))) llvm_bf16x4_t* lds_ptr =
            reinterpret_cast<__attribute__((address_space(3))) llvm_bf16x4_t*>(
                reinterpret_cast<uintptr_t>(in_ptr));
        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
    }
    else if constexpr(std::is_same_v<elem_t, ck_tile::fp8_t>)
    {
        // The 8-bit variant of the builtin operates on a vector of two index_t values.
        typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_fp8x8_t;
        __attribute__((address_space(3))) llvm_fp8x8_t* lds_ptr =
            reinterpret_cast<__attribute__((address_space(3))) llvm_fp8x8_t*>(
                reinterpret_cast<uintptr_t>(in_ptr));
        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
    }
    else
    {
        // The condition must depend on T: a literal `static_assert(false, ...)` is
        // diagnosed when the template is parsed by compilers that predate CWG2518,
        // even though this branch is discarded for every supported type.
        static_assert(sizeof(T) == 0, "not implemented");
    }
}
} // namespace ck_tile
#endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN

View File

@@ -0,0 +1,86 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/container/sequence.hpp"
#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
namespace ck_tile {
// Traits that generate the wave-level tile distribution.
// Primary template: declared only. The defaulted second parameter is the hook
// that the partial specializations use to select themselves.
template <typename T, typename Enable = void>
struct LaneGroupTransposeTraits;
// Specialization selected for element types with sizeof(T) == 2.
template <typename T>
struct LaneGroupTransposeTraits<T, typename std::enable_if<sizeof(T) == 2>::type>
{
    // Shape before the transpose: 4 x 16 (second x lead).
    static constexpr index_t ksecondDim = 4;
    static constexpr index_t kleadDim   = 16;

    // Shape after the transpose: the same two extents swapped, 16 x 4.
    static constexpr index_t ksecondDimT = kleadDim;
    static constexpr index_t kleadDimT   = ksecondDim;

    // Encoding parameterized on two outer and two inner distribution dimensions.
    // NOTE(review): the argument order presumably follows tile_distribution_encoding's
    // own parameter list -- confirm against its definition.
    template <index_t OuterDim0, index_t OuterDim1, index_t InnerDim0, index_t InnerDim1>
    using TileDistribution = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<OuterDim0, OuterDim1, 4>, sequence<InnerDim0, InnerDim1, 4, 4>>,
        tuple<sequence<1, 2, 1, 2>>,
        tuple<sequence<0, 0, 2, 2>>,
        sequence<2, 1, 2>,
        sequence<1, 1, 3>>;
};
// Specialization selected for element types with sizeof(T) == 1.
template <typename T>
struct LaneGroupTransposeTraits<T, typename std::enable_if<sizeof(T) == 1>::type>
{
    // Un-suffixed extents: 8 x 16 (second x lead).
    static constexpr index_t ksecondDim = 8;
    static constexpr index_t kleadDim   = 16;

    // T-suffixed extents are the same pair swapped: 16 x 8.
    static constexpr index_t ksecondDimT = kleadDim;
    static constexpr index_t kleadDimT   = ksecondDim;

    // Encoding parameterized on two outer and two inner distribution dimensions.
    // NOTE(review): the argument order presumably follows tile_distribution_encoding's
    // own parameter list -- confirm against its definition.
    template <index_t OuterDim0, index_t OuterDim1, index_t InnerDim0, index_t InnerDim1>
    using TileDistribution = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<OuterDim0, OuterDim1, 8>, sequence<InnerDim0, InnerDim1, 2, 8>>,
        tuple<sequence<1, 2, 1, 2>>,
        tuple<sequence<0, 0, 2, 2>>,
        sequence<2, 1, 2>,
        sequence<1, 1, 3>>;
};
/*
 * @brief Build the transposed distribution encoding for an element type and a
 *        set of distribution dimensions.
 *
 * The returned object is a value-initialized
 * LaneGroupTransposeTraits<T>::TileDistribution instantiated with the four
 * distribution dimensions, passed through in the order given.
 *
 * @tparam T The data type of the elements in the tensor.
 * @tparam kOuterDistDim0 Outer distribution dimension 0: the outer dimension for stride.
 * @tparam kOuterDistDim1 Outer distribution dimension 1: the inner dimension for stride.
 * @tparam kInnerDistDim0 Inner distribution dimension 0: the outer dimension for
 * consecutive.
 * @tparam kInnerDistDim1 Inner distribution dimension 1: the inner dimension for
 * consecutive.
 * @return An instance of the selected tile distribution encoding type.
 */
template <typename T,
          index_t kOuterDistDim0,
          index_t kOuterDistDim1,
          index_t kInnerDistDim0,
          index_t kInnerDistDim1>
CK_TILE_DEVICE constexpr auto make_transposed_distr_encode()
{
    // Construct the encoding in the return statement; no local alias is needed.
    return typename LaneGroupTransposeTraits<T>::
        template TileDistribution<kOuterDistDim0, kOuterDistDim1, kInnerDistDim0, kInnerDistDim1>{};
}
} // namespace ck_tile