mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 04:31:25 +00:00
transpose load api development (#2177)
* add transpose load; no real logic * fix some compile errors * fix some issues * update transpose load logic * add some fixes * fix a distribution issue * update some codes * add some fix * can pass; but no logic * transpose load enable * update tile transpose * miss output tile distribution mapping * hack for transpose 16x16 * update output tensor distribution * delete unused variables * fix transpose related codes * update transpose load example * exchange the iteration order * fix 16x16 related dimension transpose * fix a transpose index issue * fix a transpose index issue * fix clang format check * update load tile transpose related codes * fix compile errors and pass 16x16 tests * fix a typo * update logic * check other data types * add transpose load api * update transpose load api * fix clang format check * change file name * refactor codes * update code name * delete some unused codes * delete the unused oob flag for transpose load * update tensor view api for transpose load * update for testing * fix a typo error * move transpose ops to example directory * update transpose api * update include file * fix for pr review * fix compile errors * add transpose load; no real logic * fix some compile errors * fix some issues * update transpose load logic * add some fixes * fix a distribution issue * update some codes * add some fix * can pass; but no logic * transpose load enable * update tile transpose * miss output tile distribution mapping * hack for transpose 16x16 * update output tensor distribution * delete unused variables * fix transpose related codes * update transpose load example * exchange the iteration order * fix 16x16 related dimension transpose * fix a transpose index issue * fix a transpose index issue * fix clang format check * update load tile transpose related codes * fix compile errors and pass 16x16 tests * fix a typo * update logic * check other data types * add transpose load api * update transpose load api * fix clang format check * 
change file name * refactor codes * update code name * delete some unused codes * delete the unused oob flag for transpose load * update tensor view api for transpose load * update for testing * fix a typo error * move transpose ops to example directory * update transpose api * update include file * fix for pr review * fix compile errors * change directory name * delete the duplicated directory * update cmakelists file * delete the unused codes * update function names * update transpose policy * update code after remod.py * update codes * add some comment * Polish the instr infrastructure * build up the fixed instr * redesign the transpose api, currently it has numerical error * add the bf16 transpose * fix some issues * add some comments * update document * Finished the refactor of API and pass through the verification * fix the merging issue --------- Co-authored-by: ThomasNing <thomas.ning@amd.com>
This commit is contained in:
59
example/ck_tile/37_transpose/transpose_api.cpp
Normal file
59
example/ck_tile/37_transpose/transpose_api.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#include "transpose_example.hpp"
|
||||
#include <iostream>
|
||||
|
||||
template <typename ts_type,
|
||||
ck_tile::index_t block_x,
|
||||
ck_tile::index_t block_y,
|
||||
ck_tile::index_t warp_x,
|
||||
ck_tile::index_t warp_y>
|
||||
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
|
||||
{
|
||||
uint32_t dim_block_h = (a.height + block_y - 1) / block_y;
|
||||
uint32_t dim_block_w = (a.width + block_x - 1) / block_x;
|
||||
uint32_t dim_stride = a.height * a.width;
|
||||
|
||||
a.dim_stride = dim_stride;
|
||||
a.dim_block_h = dim_block_h;
|
||||
a.dim_block_w = dim_block_w;
|
||||
|
||||
using ts_problem = ck_tile::TransposePipelineProblem<ts_type,
|
||||
ck_tile::tensor_layout::gemm::RowMajor,
|
||||
64,
|
||||
1,
|
||||
1,
|
||||
block_y,
|
||||
block_x,
|
||||
warp_y,
|
||||
warp_x>;
|
||||
using ts_pipeline = ck_tile::BlockTranspose<ts_problem>;
|
||||
|
||||
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
|
||||
|
||||
auto kargs = kernel::MakeKargs(a);
|
||||
|
||||
const dim3 grids = kernel::GridSize(a);
|
||||
constexpr dim3 blocks = kernel::BlockSize();
|
||||
|
||||
float ave_time = ck_tile::launch_kernel(
|
||||
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
// Host entry point: picks a batched_transpose_dispatch instantiation from the
// data-type string in `t.type` and runs it with the by-value copies of `a` and `s`.
//
// Handled type strings:
//   "fp16" -> ck_tile::fp16_t with template arguments <16, 32, 16, 32>
//   "fp8"  -> ck_tile::fp8_t  with template arguments <16, 64, 16, 64>
//
// Returns the value produced by the selected dispatch, or -1 when `t.type`
// matches none of the strings above.
float batched_transpose(batched_transpose_trait t,
                        batched_transpose_kargs a,
                        ck_tile::stream_config s)
{
    if(t.type == "fp16")
        return batched_transpose_dispatch<ck_tile::fp16_t, 16, 32, 16, 32>(a, s);

    if(t.type == "fp8")
        return batched_transpose_dispatch<ck_tile::fp8_t, 16, 64, 16, 64>(a, s);

    // No instantiation is available for the requested type string.
    return -1;
}
|
||||
Reference in New Issue
Block a user