mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-24 06:44:36 +00:00
Vectorized Transpose for Batched Transpose CK Tile Operator (#2131)
* Shared Memory for single data point
* CKTile Transpose vectorize CP1
* CKTile Transpose vectorize CP2
* CKTile Transpose vectorize CP2.1
* fixed the compile error of the transpose tile 2d
* Have the correct result for the current test sample
* Changes to printing tensor
* fp8 support added
* Debugging for transpose
* solving the corner issue
* Changed padding flag
* Intermideate Debugging
* Intermidiate Debugging
* Intermediate Debugging
* Finished debugging of the transpose op
* Code Cleanup
* Adding edge case smoke tests
* Adding Transpose test to CI/CD
* Adding Transpose test to CI/CD
* Adding Transpose test to CI/CD
* Addressing Review Comment
* Addressing Comments
* Addressing Comments
* Measuring Perf Tests
* Code Cleanup
* Changlog
* Added the running iterations
* clang format
* Fix the changelog
* Fix the compilation error
* change the printing factor
---------
Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>
[ROCm/composable_kernel commit: 9d1e44e56a]
This commit is contained in:
@@ -384,22 +384,6 @@ struct tensor_view
|
||||
coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE void print() const
|
||||
{
|
||||
printf("tensor_view{");
|
||||
|
||||
// buf_
|
||||
printf("buf_: ");
|
||||
print(buf_);
|
||||
printf(", ");
|
||||
|
||||
// desc_
|
||||
printf("desc_: ");
|
||||
print(desc_);
|
||||
|
||||
printf("}");
|
||||
}
|
||||
|
||||
// member
|
||||
buffer_view buf_;
|
||||
TensorDesc desc_;
|
||||
@@ -494,6 +478,7 @@ template <typename TensorView,
|
||||
CK_TILE_HOST_DEVICE constexpr auto
|
||||
pad_tensor_view(const TensorView& tensor_view, const TileLengths& tile_lengths, DoPads)
|
||||
{
|
||||
|
||||
constexpr index_t num_dim = DoPads::size();
|
||||
|
||||
static_assert(num_dim == TileLengths::size() && num_dim == TensorView::get_num_of_dimension(),
|
||||
|
||||
@@ -85,7 +85,12 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
|
||||
|
||||
// SFC
|
||||
constexpr auto scalars_per_access_arr = generate_array(
|
||||
[&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
|
||||
[&](auto i) {
|
||||
if constexpr(vec_length_in == 1)
|
||||
return 1;
|
||||
else
|
||||
return (i == y_dim_vec_in || i == y_dim_vec_out) ? y_lengths[i] : 1;
|
||||
},
|
||||
number<NDimY>{});
|
||||
|
||||
constexpr auto scalars_per_access = TO_SEQUENCE(scalars_per_access_arr, NDimY);
|
||||
@@ -103,13 +108,19 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
|
||||
// loop over SFC
|
||||
static_for<0, num_access, 1>{}([&](auto iAccess) {
|
||||
// data index [y0, y1, ...] in the order of input tensor
|
||||
constexpr auto idx_y = SFC_Y::get_index(iAccess);
|
||||
|
||||
constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y);
|
||||
constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y);
|
||||
|
||||
constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
|
||||
constexpr auto idx_y_in =
|
||||
generate_tuple([&](auto ii) { return idx_y_start[ii].value; }, number<NDimY>{});
|
||||
constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
|
||||
static_assert(in_offset % vec_length_in == 0);
|
||||
constexpr auto idx_y_out_tmp =
|
||||
generate_array([&](auto ii) { return idx_y_start[ii].value; }, number<NDimY>{});
|
||||
constexpr auto idx_y_out =
|
||||
container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
|
||||
constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
|
||||
if constexpr(vec_length_in == 1)
|
||||
{
|
||||
|
||||
out_tensor.get_thread_buffer()[number<out_offset>{}] =
|
||||
in_tensor.get_thread_buffer()[number<in_offset>{}];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user