Vectorized Transpose for Batched Transpose CK Tile Operator (#2131)

* Shared Memory for single data point

* CKTile Transpose vectorize CP1

* CKTile Transpose vectorize CP2

* CKTile Transpose vectorize CP2.1

* fixed the compile error of the transpose tile 2d

* Have the correct result for the current test sample

* Changes to printing tensor

* fp8 support added

* Debugging for transpose

* solving the corner issue

* Changed padding flag

* Intermideate Debugging

* Intermidiate Debugging

* Intermediate Debugging

* Finished debugging of the transpose op

* Code Cleanup

* Adding edge case smoke tests

* Adding Transpose test to CI/CD

* Adding Transpose test to CI/CD

* Adding Transpose test to CI/CD

* Addressing Review Comment

* Addressing Comments

* Addressing Comments

* Measuring Perf Tests

* Code Cleanup

* Changlog

* Added the running iterations

* clang format

* Fix the changelog

* Fix the compilation error

* change the printing factor

---------

Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>
This commit is contained in:
Thomas Ning
2025-05-12 00:41:45 -07:00
committed by GitHub
parent d8faf1c6a1
commit 9d1e44e56a
14 changed files with 311 additions and 152 deletions

View File

@@ -384,22 +384,6 @@ struct tensor_view
coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
}
CK_TILE_HOST_DEVICE void print() const
{
printf("tensor_view{");
// buf_
printf("buf_: ");
print(buf_);
printf(", ");
// desc_
printf("desc_: ");
print(desc_);
printf("}");
}
// member
buffer_view buf_;
TensorDesc desc_;
@@ -494,6 +478,7 @@ template <typename TensorView,
CK_TILE_HOST_DEVICE constexpr auto
pad_tensor_view(const TensorView& tensor_view, const TileLengths& tile_lengths, DoPads)
{
constexpr index_t num_dim = DoPads::size();
static_assert(num_dim == TileLengths::size() && num_dim == TensorView::get_num_of_dimension(),

View File

@@ -85,7 +85,12 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
// SFC
constexpr auto scalars_per_access_arr = generate_array(
[&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
[&](auto i) {
if constexpr(vec_length_in == 1)
return 1;
else
return (i == y_dim_vec_in || i == y_dim_vec_out) ? y_lengths[i] : 1;
},
number<NDimY>{});
constexpr auto scalars_per_access = TO_SEQUENCE(scalars_per_access_arr, NDimY);
@@ -103,13 +108,19 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
// loop over SFC
static_for<0, num_access, 1>{}([&](auto iAccess) {
// data index [y0, y1, ...] in the order of input tensor
constexpr auto idx_y = SFC_Y::get_index(iAccess);
constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y);
constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y);
constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
constexpr auto idx_y_in =
generate_tuple([&](auto ii) { return idx_y_start[ii].value; }, number<NDimY>{});
constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
static_assert(in_offset % vec_length_in == 0);
constexpr auto idx_y_out_tmp =
generate_array([&](auto ii) { return idx_y_start[ii].value; }, number<NDimY>{});
constexpr auto idx_y_out =
container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
if constexpr(vec_length_in == 1)
{
out_tensor.get_thread_buffer()[number<out_offset>{}] =
in_tensor.get_thread_buffer()[number<in_offset>{}];
}