mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
* add transpose load; no real logic
* fix some compile errors
* fix some issues
* update transpose load logic
* add some fixes
* fix a distribution issue
* update some codes
* add some fix
* can pass; but no logic
* transpose load enable
* update tile transpose
* miss output tile distribution mapping
* hack for transpose 16x16
* update output tensor distribution
* delete unused variables
* fix transpose related codes
* update transpose load example
* exchange the iteration order
* fix 16x16 related dimension transpose
* fix a transpose index issue
* fix a transpose index issue
* fix clang format check
* update load tile transpose related codes
* fix compile errors and pass 16x16 tests
* fix a typo
* update logic
* check other data types
* add transpose load api
* update transpose load api
* fix clang format check
* change file name
* refactor codes
* update code name
* delete some unused codes
* delete the unused oob flag for transpose load
* update tensor view api for transpose load
* update for testing
* fix a typo error
* move transpose ops to example directory
* update transpose api
* update include file
* fix for pr review
* fix compile errors
* add transpose load; no real logic
* fix some compile errors
* fix some issues
* update transpose load logic
* add some fixes
* fix a distribution issue
* update some codes
* add some fix
* can pass; but no logic
* transpose load enable
* update tile transpose
* miss output tile distribution mapping
* hack for transpose 16x16
* update output tensor distribution
* delete unused variables
* fix transpose related codes
* update transpose load example
* exchange the iteration order
* fix 16x16 related dimension transpose
* fix a transpose index issue
* fix a transpose index issue
* fix clang format check
* update load tile transpose related codes
* fix compile errors and pass 16x16 tests
* fix a typo
* update logic
* check other data types
* add transpose load api
* update transpose load api
* fix clang format check
* change file name
* refactor codes
* update code name
* delete some unused codes
* delete the unused oob flag for transpose load
* update tensor view api for transpose load
* update for testing
* fix a typo error
* move transpose ops to example directory
* update transpose api
* update include file
* fix for pr review
* fix compile errors
* change directory name
* delete the duplicated directory
* update cmakelists file
* delete the unused codes
* update function names
* update transpose policy
* update code after remod.py
* update codes
* add some comment
* Polish the instr infrastructure
* build up the fixed instr
* redesign the transpose api, currently it has numerical error
* add the bf16 transpose
* fix some issues
* add some comments
* update document
* Finished the refactor of API and pass through the verification
* fix the merging issue
---------
Co-authored-by: ThomasNing <thomas.ning@amd.com>
[ROCm/composable_kernel commit: a2f01141aa]
60 lines
2.1 KiB
C++
60 lines
2.1 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
#include "transpose_example.hpp"
|
|
#include <iostream>
|
|
|
|
template <typename ts_type,
|
|
ck_tile::index_t block_x,
|
|
ck_tile::index_t block_y,
|
|
ck_tile::index_t warp_x,
|
|
ck_tile::index_t warp_y>
|
|
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
|
|
{
|
|
uint32_t dim_block_h = (a.height + block_y - 1) / block_y;
|
|
uint32_t dim_block_w = (a.width + block_x - 1) / block_x;
|
|
uint32_t dim_stride = a.height * a.width;
|
|
|
|
a.dim_stride = dim_stride;
|
|
a.dim_block_h = dim_block_h;
|
|
a.dim_block_w = dim_block_w;
|
|
|
|
using ts_problem = ck_tile::TransposePipelineProblem<ts_type,
|
|
ck_tile::tensor_layout::gemm::RowMajor,
|
|
64,
|
|
1,
|
|
1,
|
|
block_y,
|
|
block_x,
|
|
warp_y,
|
|
warp_x>;
|
|
using ts_pipeline = ck_tile::BlockTranspose<ts_problem>;
|
|
|
|
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
|
|
|
|
auto kargs = kernel::MakeKargs(a);
|
|
|
|
const dim3 grids = kernel::GridSize(a);
|
|
constexpr dim3 blocks = kernel::BlockSize();
|
|
|
|
float ave_time = ck_tile::launch_kernel(
|
|
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));
|
|
|
|
return ave_time;
|
|
}
|
|
|
|
// Runtime entry point: selects a compiled tile configuration from t.type and runs it.
//
//   "fp16" -> ck_tile::fp16_t, block_x=16, block_y=32, warp_x=16, warp_y=32
//   "fp8"  -> ck_tile::fp8_t,  block_x=16, block_y=64, warp_x=16, warp_y=64
//
// `a` and `s` are taken by value, so the fields the dispatch writes into `a`
// are not visible to the caller.
//
// Returns the dispatch result, or -1 when t.type matches neither string above.
float batched_transpose(batched_transpose_trait t,
                        batched_transpose_kargs a,
                        ck_tile::stream_config s)
{
    if(t.type == "fp16")
        return batched_transpose_dispatch<ck_tile::fp16_t, 16, 32, 16, 32>(a, s);

    if(t.type == "fp8")
        return batched_transpose_dispatch<ck_tile::fp8_t, 16, 64, 16, 64>(a, s);

    // Unsupported element type: signal failure with a negative value.
    return -1;
}
|