[CK-Tile] Merge transpose examples (#2450)

* unify pipeline signature with existing example

* iwyu

* move stuff around in load-tile-transpose

* cleanups in batched transpose pipeline

* comments

* use same inputs size

* cleaner printf

* print host args

* use 64 block sides in the 37_transpose example

* roll back grid dimension size adjustment for 37_transpose example

* transpose grid for 37_transpose to unify with 35_batched_transpose

* unify grid computation logic

* make policy methods device only (since they are used only on device from the pipeline)

* more host/device attribute cleanups

* copy over problem

* move over pipeline and policy

* add switch to batched transpose api

* make the lds problem more similar to original problem

* factor out logic into traits

* factor out conditional compilation into trait parameter

* propagate pipeline to args

* unhardcode pipeline dispatch parameter

* refactor vector size

* put warp tile out of dispatch

* rename template parameter for trait

* rewrite vector size in terms of problem

* mark policy-internal struct variable as device

* factor out input distribution and thread access pattern from policies

* reword vector size

* use datatype across batched transpose pipelines, problems and kernel

* remove transpose traits from lds pipeline

* add padding to the lds pipeline *interface*

* add comment

* remove ck_tile example #37

* update cmakelists

* add test for new pipeline

* update batched transpose test

* roll back load_tile_transpose changes

* remove comments

* pack dispatch parameters into a config

* padM can be enabled

* adjust lds vector size to enable padding along N

* update test

* clean up logic

* swap m/n input vector size

* adjust perf test script

* sweep over C/W in perf test

* count both read and written bytes into bandwidth (x2 the number)

* clang-format

* widen size range for perf test

* remove 64k x 64k case; it's too large for index

* remove thread tile from dispatch

* Solve merge conflict

* fix compile

* modify the transpose

* solve the test error and clang format

* Add v3 support for Grouped fwd conv+bias+clamp & ckProfiler (#2463)

* Add logging to IsSupported.

* Less casting in AddClamp

* Conv+bias+clamp instances & profiler BF16

* Fix 3D instances & run just 1x for verification.

* Run just once for verification conv fwd.

* ckProfiler conv fwd clamp

* Remove exec bit & formatting

* Add support for MultiD for grouped conv fwd v3.

* Enable 2Lds.

* clean

* align instances

* align instances

* profiler fixes

* Fixes

* fix

* fix

---------

Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Fixing 0ms and inf GB/s issue in img2col (#2565)

issue :
====
``` sh
$ bin/tile_example_img2col
Perf: 0 ms, inf GB/s
```

solution :
======
Problem occurred because config.time_kernel is false by default.
If false, then there is no need to calculate perf; just print a proper message instead:

`image_to_coloumn: pass, No Perf generated due to config.time_kernel=0`

* merge with develop

* solve clang format

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
Co-authored-by: rahjain-amd <Rahul.Jain@amd.com>
This commit is contained in:
Max Podkorytov
2025-07-26 21:51:54 -07:00
committed by GitHub
parent d2459878cf
commit 821cd26c13
24 changed files with 431 additions and 869 deletions

View File

@@ -2,41 +2,93 @@
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "batched_transpose_example.hpp"
template <typename ts_type,
ck_tile::index_t block_x,
ck_tile::index_t block_y,
ck_tile::index_t warp_x,
ck_tile::index_t warp_y,
ck_tile::index_t thread_x,
ck_tile::index_t thread_y,
bool kPadM,
bool kPadN>
namespace {
template <int32_t pipeline_id>
struct kernel_traits;
template <>
struct kernel_traits<0>
{
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Problem =
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
using Policy = ck_tile::BatchedTransposePolicy;
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Pipeline =
ck_tile::BatchedTransposePipeline<Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
Policy>;
};
template <>
struct kernel_traits<1>
{
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Problem =
ck_tile::BatchedTransposeLdsProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
using Policy = ck_tile::BatchedTransposeLdsPolicy;
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Pipeline = ck_tile::BatchedTransposeLdsPipeline<
Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
Policy>;
};
} // namespace
template <typename InputType_,
ck_tile::index_t BlockX_,
ck_tile::index_t BlockY_,
ck_tile::index_t NumWarpsX_,
ck_tile::index_t NumWarpsY_,
bool PadM_,
bool PadN_,
ck_tile::index_t PipelineId_>
struct BatchedTransposeConfig
{
using InputType = InputType_;
static constexpr ck_tile::index_t kBlockX = BlockX_;
static constexpr ck_tile::index_t kBlockY = BlockY_;
static constexpr ck_tile::index_t kNumWarpsX = NumWarpsX_;
static constexpr ck_tile::index_t kNumWarpsY = NumWarpsY_;
static constexpr bool kPadM = PadM_;
static constexpr bool kPadN = PadN_;
static constexpr ck_tile::index_t kPipelineId = PipelineId_;
};
template <typename Config>
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
{
uint32_t dim_stride = a.height * a.width;
a.dim_stride = dim_stride;
a.dim_block_h = block_y;
a.dim_block_w = block_x;
a.dim_block_h = Config::kBlockY;
a.dim_block_w = Config::kBlockX;
using block_tile = ck_tile::sequence<block_x, block_y>;
using warp_tile = ck_tile::sequence<warp_x, warp_y>;
using thread_tile = ck_tile::sequence<thread_x, thread_y>;
using ts_problem =
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile, kPadM, kPadN>;
using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
// TODO: this is fragile and slow to compile
using kernel = ck_tile::BatchedTransposeKernel<
typename kernel_traits<Config::kPipelineId>::template Pipeline<
typename Config::InputType,
ck_tile::sequence<Config::kBlockX, Config::kBlockY>,
ck_tile::sequence<Config::kNumWarpsX, Config::kNumWarpsY>,
Config::kPadM,
Config::kPadN>>;
auto kargs = kernel::MakeKargs(a);
const dim3 grids = kernel::GridSize(a);
constexpr dim3 blocks = kernel::BlockSize();
printf("Grid: %u %u %u\n", grids.x, grids.y, grids.z);
printf("Block: %u %u %u\n", blocks.x, blocks.y, blocks.z);
printf("kargs: kargs.batch %d kargs.height %d kargs.width %d kargs.dim_strid %d\n",
printf("Pipeline: %d\n", Config::kPipelineId);
printf("Grid: x=%u y=%u z=%u\n", grids.x, grids.y, grids.z);
printf("Block: x=%u y=%u z=%u\n", blocks.x, blocks.y, blocks.z);
printf(
"Host args: batch=%d, height=%d, width=%d, dim_stride=%d, dim_block_h=%d, dim_block_w=%d\n",
a.batch,
a.height,
a.width,
a.dim_stride,
a.dim_block_h,
a.dim_block_w);
printf("kargs: kargs.batch=%d kargs.height=%d kargs.width=%d kargs.dim_stride=%d\n",
kargs.batch,
kargs.height,
kargs.width,
@@ -52,22 +104,29 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
return ave_time;
}
// Param Comb: type_size, block_x & y, warp_x & y, thread_x & y
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, false, false) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, false, false) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, false, false)
// Param Comb: type_size, block_x & y, WarpNum_x & y
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 0) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 0) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 0) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 0) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 0) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 0) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 1) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 1) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 1) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 1) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 1) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 1)
// Macro that defines one static function per line
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN) \
static float \
transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY##_##PADM##_##PADN( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN>(a, s); \
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE) \
static float \
transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##PADM##_##PADN##_v##PIPE( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch< \
BatchedTransposeConfig<REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE>>(a, s); \
}
FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
@@ -76,38 +135,78 @@ float batched_transpose(batched_transpose_trait t,
batched_transpose_kargs a,
ck_tile::stream_config s)
{
if(t.type == "fp8")
if(t.pipeline == "0")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
if(t.type == "fp8")
{
return transpose_fn_fp8_64_64_64_64_8_8_false_false(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp8_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_fp8_64_64_1_1_true_true_v0(a, s);
}
}
else
else if(t.type == "fp16")
{
return transpose_fn_fp8_64_64_64_64_8_8_true_true(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp16_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_fp16_64_64_1_1_true_true_v0(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_bf16_64_64_1_1_true_true_v0(a, s);
}
}
}
else if(t.type == "fp16")
else if(t.pipeline == "1")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
if(t.type == "fp8")
{
return transpose_fn_fp16_64_64_64_64_8_8_false_false(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp8_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_fp8_64_64_1_1_true_true_v1(a, s);
}
}
else
else if(t.type == "fp16")
{
return transpose_fn_fp16_64_64_64_64_8_8_true_true(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_64_64_8_8_false_false(a, s);
}
else
{
return transpose_fn_bf16_64_64_64_64_8_8_true_true(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp16_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_fp16_64_64_1_1_true_true_v1(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_bf16_64_64_1_1_true_true_v1(a, s);
}
}
}
return -1;
}

View File

@@ -102,7 +102,8 @@ auto create_args(int argc, char* argv[])
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("seed", "-1", "seed to be used, -1 means random every time")
.insert("kname", "0", "t to 1 will print kernel name");
.insert("kname", "0", "t to 1 will print kernel name")
.insert("pipeline", "0", "0: no LDS usage, 1: LDS-accelerated (gfx950)");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -121,6 +122,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)
int n_repeat = args.get_int("repeat");
std::string layout_in = args.get_str("layout_in");
std::string layout_out = args.get_str("layout_out");
std::string pipeline = args.get_str("pipeline");
int seed = args.get_int("seed");
int dim_in[4], dim_out[4];
@@ -166,7 +168,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)
x_dev.ToDevice(x_host.data());
auto trait = batched_transpose_trait{prec, layout_in};
auto trait = batched_transpose_trait{prec, layout_in, pipeline};
uint32_t height = nchw2nhwc ? C : H * W;
uint32_t width = nchw2nhwc ? H * W : C;
@@ -185,17 +187,15 @@ bool run_batched_transpose(ck_tile::ArgParser args)
auto ms = batched_transpose(trait, karg, sc);
std::size_t num_operations = N * C * H * (W - 1);
std::size_t num_bytes = N * C * H * W * sizeof(Type);
std::size_t num_bytes = N * C * H * W * sizeof(Type) * 2; // read + written
float ave_time = ms * 1E-3;
float gb_per_sec = num_bytes / ms * 1.E-6;
float tflops = static_cast<float>(num_operations) / ms * 1.E-6;
std::cout << "Run Batched Transpose kernel with N=" << N << ", C=" << C << ", H=" << H
<< ", W=" << W << ", layout_in=" << layout_in << ", layout_out=" << layout_out
<< " : " << ms << " ms (" << ave_time << " ave_time), " << tflops << " TFlops"
<< gb_per_sec << " GB/s, " << std::endl;
<< " : " << std::endl
<< ms << " ms " << std::endl
<< gb_per_sec << " GB/s " << std::endl;
printf("[%s]N:%d, C:%d, H:%d, W:%d, layout_in:%s, %f\n",
prec.c_str(),

View File

@@ -14,6 +14,7 @@ struct batched_transpose_trait
{
std::string type;
std::string layout;
std::string pipeline;
};
struct batched_transpose_kargs : public ck_tile::BatchedTransposeHostArgs

View File

@@ -5,10 +5,14 @@
EXE=./build/bin/tile_example_batched_transpose
for C in "64" "256" "1024" "4096" "16384"; do
for W in "64" "256" "1024" "4096" "16384"; do
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=4096 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
for pipeline in "0" "1"; do
$EXE -pipeline=$pipeline -pr=$pr -N=1 -C=$C -H=1 -W=$W -layout_in='NCHW' -layout_out='NHWC'
done
done
done
done

View File

@@ -6,25 +6,27 @@
EXE=./build/bin/tile_example_batched_transpose
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=16 -C=64 -H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW'
for pipeline in "0" "1"; do
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=16 -C=64 -H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW'
done
done