composable_kernel/test/ck_tile/batched_transpose/batched_transpose_api.cpp
Max Podkorytov 821cd26c13 [CK-Tile] Merge transpose examples (#2450)
* unify pipeline signature with existing example

* iwyu

* move stuff around in load-tile-transpose

* cleanups in batched transpose pipeline

* comments

* use same input sizes

* cleaner printf

* print host args

* use 64 block sides in the 37_transpose example

* roll back grid dimension size adjustment for 37_transpose example

* transpose grid for 37_transpose to unify with 35_batched_transpose

* unify grid computation logic

* make policy methods device only (since they are used only on device from the pipeline)

* more host/device attribute cleanups

* copy over problem

* move over pipeline and policy

* add switch to batched transpose api

* make the lds problem more similar to original problem

* factor out logic into traits

* factor out conditional compilation into trait parameter

* propagate pipeline to args

* unhardcode pipeline dispatch parameter

* refactor vector size

* put warp tile out of dispatch

* rename template parameter for trait

* rewrite vector size in terms of problem

* mark policy-internal struct variable as device

* factor out input distribution and thread access pattern from policies

* reword vector size

* use datatype across batched transpose pipelines, problems and kernel

* remove transpose traits from lds pipeline

* add padding to the lds pipeline *interface*

* add comment

* remove ck_tile example #37

* update cmakelists

* add test for new pipeline

* update batched transpose test

* roll back load_tile_transpose changes

* remove comments

* pack dispatch parameters into a config

* padM can be enabled

* adjust lds vector size to enable padding along N

* update test

* clean up logic

* swap m/n input vector size

* adjust perf test script

* sweep over C/W in perf test

* count both read and written bytes into bandwidth (x2 the number)

* clang-format

* widen size range for perf test

* remove 64k x 64k case; it's too large for index

* remove thread tile from dispatch

* Solve merge conflict

* fix compile

* modify the transpose

* fix the test error and clang-format

* Add v3 support for Grouped fwd conv+bias+clamp & ckProfiler (#2463)

* Add logging to IsSupported.

* Less casting in AddClamp

* Conv+bias+clamp instances & profiler BF16

* Fix 3D instances & run just 1x for verification.

* Run just once for verification of conv fwd.

* ckProfiler conv fwd clamp

* Remove exec bit & formatting

* Add support for MultiD for grouped conv fwd v3.

* Enable 2Lds.

* clean

* align instances

* align instances

* profiler fixes

* Fixes

* fix

* fix

---------

Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Fixing 0ms and inf GB/s issue in img2col (#2565)

Issue:
====

```sh
$ bin/tile_example_img2col
Perf: 0 ms, inf GB/s
```

Solution:
======
The problem occurred because config.time_kernel is false by default.
If it is false, there is no need to calculate perf; just print a proper message:

`image_to_coloumn: pass, No Perf generated due to config.time_kernel=0`
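
A minimal sketch of the guard described above, assuming variable names (`config`, `ave_time`, `gb_per_sec`) in the style of the ck_tile examples; it is illustrative, not the exact patch:

```cpp
// Illustrative only: the names below are assumptions based on the commit message.
if(!config.time_kernel)
{
    // Timing was skipped, so there are no meaningful perf numbers to report.
    printf("image_to_coloumn: pass, No Perf generated due to config.time_kernel=0\n");
}
else
{
    printf("Perf: %f ms, %f GB/s\n", ave_time, gb_per_sec);
}
```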

* merge with develop

* fix clang-format

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
Co-authored-by: rahjain-amd <Rahul.Jain@amd.com>
2025-07-26 21:51:54 -07:00


// Copyright © Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "batched_transpose.hpp"

template <typename ts_type,
          ck_tile::index_t block_x,
          ck_tile::index_t block_y,
          ck_tile::index_t warp_x,
          ck_tile::index_t warp_y,
          bool kPadM,
          bool kPadN>
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
{
    // Each batch is one contiguous H x W plane, so the batch stride is H * W.
    uint32_t dim_stride = a.height * a.width;
    a.dim_stride        = dim_stride;
    a.dim_block_h       = block_y;
    a.dim_block_w       = block_x;

    using block_tile  = ck_tile::sequence<block_x, block_y>;
    using warp_layout = ck_tile::sequence<warp_x, warp_y>;
    using ts_problem =
        ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
    using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;
    using kernel      = ck_tile::BatchedTransposeKernel<ts_pipeline>;

    auto kargs            = kernel::MakeKargs(a);
    const dim3 grids      = kernel::GridSize(a);
    constexpr dim3 blocks = kernel::BlockSize();

    printf("Grid: %u %u %u\n", grids.x, grids.y, grids.z);
    printf("Block: %u %u %u\n", blocks.x, blocks.y, blocks.z);
    printf("kargs: kargs.batch %d kargs.height %d kargs.width %d kargs.dim_stride %d\n",
           kargs.batch,
           kargs.height,
           kargs.width,
           kargs.dim_stride);
    printf("Launching Kernel...\n");

    float ave_time = ck_tile::launch_kernel(
        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));

    printf("Kernel finished...\n");
    return ave_time;
}
// Param combinations: short name, data type, block_x & y, warp_x & y, kPadM & kPadN
#define FOREACH_TRANSPOSE_PARAM(F)                        \
    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true)      \
    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false)    \
    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true)    \
    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false)  \
    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true)    \
    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false)
// Macro that defines one static function per line
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, PADM, PADN)               \
    static float transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##PADM##_##PADN( \
        batched_transpose_kargs& a, ck_tile::stream_config& s)                            \
    {                                                                                     \
        return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, PADM, PADN>(a, s);   \
    }
FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
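// For reference, the first FOREACH_TRANSPOSE_PARAM entry expands (roughly) to:
//   static float transpose_fn_fp8_64_64_1_1_true_true(
//       batched_transpose_kargs& a, ck_tile::stream_config& s)
//   {
//       return batched_transpose_dispatch<ck_tile::fp8_t, 64, 64, 1, 1, true, true>(a, s);
//   }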
float batched_transpose(batched_transpose_trait t,
                        batched_transpose_kargs a,
                        ck_tile::stream_config s)
{
    // Shapes that tile evenly into 64x64 blocks use the non-padded kernels;
    // anything else falls back to the padded (kPadM/kPadN = true) variants.
    if(t.type == "fp8")
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
            return transpose_fn_fp8_64_64_1_1_false_false(a, s);
        }
        else
        {
            return transpose_fn_fp8_64_64_1_1_true_true(a, s);
        }
    }
    else if(t.type == "fp16")
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
            return transpose_fn_fp16_64_64_1_1_false_false(a, s);
        }
        else
        {
            return transpose_fn_fp16_64_64_1_1_true_true(a, s);
        }
    }
    else if(t.type == "bf16")
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
            return transpose_fn_bf16_64_64_1_1_false_false(a, s);
        }
        else
        {
            return transpose_fn_bf16_64_64_1_1_true_true(a, s);
        }
    }
    // Unsupported data type.
    return -1;
}
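
// ---------------------------------------------------------------------------
// Hypothetical usage sketch (not part of the original file). Field names
// `type`, `batch`, `height`, and `width` follow the code above; any device
// buffer pointers the kargs struct carries are omitted here as an assumption.
//
//   batched_transpose_trait t{};
//   t.type = "fp16";
//
//   batched_transpose_kargs a{};
//   a.batch  = 4;
//   a.height = 128; // multiples of 64 select the non-padded kernels
//   a.width  = 256;
//
//   ck_tile::stream_config s{};
//   float ave_ms = batched_transpose(t, a, s);
// ---------------------------------------------------------------------------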