Files
composable_kernel/example/ck_tile/37_transpose/transpose_api.cpp
joyeamd a2f01141aa transpose load api development (#2177)
* add transpose load; no real logic

* fix some compile errors

* fix some issues

* update transpose load logic

* add some fixes

* fix a distribution issue

* update some codes

* add some fix

* can pass; but no logic

* transpose load enable

* update tile transpose

* miss output tile distribution mapping

* hack for transpose 16x16

* update output tensor distribution

* delete unused variables

* fix transpose related codes

* update transpose load example

* exchange the iteration order

* fix 16x16 related dimension transpose

* fix a transpose index issue

* fix a transpose index issue

* fix clang format check

* update load tile transpose related codes

* fix compile errors and pass 16x16 tests

* fix a typo

* update logic

* check other data types

* add transpose load api

* update transpose load api

* fix clang format check

* change file name

* refactor codes

* update code name

* delete some unused codes

* delete the unused oob flag for transpose load

* update tensor view api for transpose load

* update for testing

* fix a typo error

* move transpose ops to example directory

* update transpose api

* update include file

* fix for pr review

* fix compile errors

* add transpose load; no real logic

* fix some compile errors

* fix some issues

* update transpose load logic

* add some fixes

* fix a distribution issue

* update some codes

* add some fix

* can pass; but no logic

* transpose load enable

* update tile transpose

* miss output tile distribution mapping

* hack for transpose 16x16

* update output tensor distribution

* delete unused variables

* fix transpose related codes

* update transpose load example

* exchange the iteration order

* fix 16x16 related dimension transpose

* fix a transpose index issue

* fix a transpose index issue

* fix clang format check

* update load tile transpose related codes

* fix compile errors and pass 16x16 tests

* fix a typo

* update logic

* check other data types

* add transpose load api

* update transpose load api

* fix clang format check

* change file name

* refactor codes

* update code name

* delete some unused codes

* delete the unused oob flag for transpose load

* update tensor view api for transpose load

* update for testing

* fix a typo error

* move transpose ops to example directory

* update transpose api

* update include file

* fix for pr review

* fix compile errors

* change directory name

* delete the duplicated directory

* update cmakelists file

* delete the unused codes

* update function names

* update transpose policy

* update code after remod.py

* update codes

* add some comment

* Polish the instr infrastructure

* build up the fixed instr

* redesign the transpose api, currently it has numerical error

* add the bf16 transpose

* fix some issues

* add some comments

* update document

* Finished the refactor of API and pass through the verification

* fix the merging issue

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
2025-06-18 01:28:34 -07:00

60 lines
2.1 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "transpose_example.hpp"
#include <iostream>
template <typename ts_type,
ck_tile::index_t block_x,
ck_tile::index_t block_y,
ck_tile::index_t warp_x,
ck_tile::index_t warp_y>
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
{
uint32_t dim_block_h = (a.height + block_y - 1) / block_y;
uint32_t dim_block_w = (a.width + block_x - 1) / block_x;
uint32_t dim_stride = a.height * a.width;
a.dim_stride = dim_stride;
a.dim_block_h = dim_block_h;
a.dim_block_w = dim_block_w;
using ts_problem = ck_tile::TransposePipelineProblem<ts_type,
ck_tile::tensor_layout::gemm::RowMajor,
64,
1,
1,
block_y,
block_x,
warp_y,
warp_x>;
using ts_pipeline = ck_tile::BlockTranspose<ts_problem>;
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
auto kargs = kernel::MakeKargs(a);
const dim3 grids = kernel::GridSize(a);
constexpr dim3 blocks = kernel::BlockSize();
float ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));
return ave_time;
}
float batched_transpose(batched_transpose_trait t,
batched_transpose_kargs a,
ck_tile::stream_config s)
{
if(t.type == "fp16")
{
return batched_transpose_dispatch<ck_tile::fp16_t, 16, 32, 16, 32>(a, s);
}
else if(t.type == "fp8")
{
return batched_transpose_dispatch<ck_tile::fp8_t, 16, 64, 16, 64>(a, s);
}
return -1;
}