Vectorized Transpose for Batched Transpose CK Tile Operator (#2131)

* Shared Memory for single data point

* CKTile Transpose vectorize CP1

* CKTile Transpose vectorize CP2

* CKTile Transpose vectorize CP2.1

* fixed the compile error of the transpose tile 2d

* Have the correct result for the current test sample

* Changes to printing tensor

* fp8 support added

* Debugging for transpose

* solving the corner issue

* Changed padding flag

* Intermediate Debugging

* Intermediate Debugging

* Intermediate Debugging

* Finished debugging of the transpose op

* Code Cleanup

* Adding edge case smoke tests

* Adding Transpose test to CI/CD

* Adding Transpose test to CI/CD

* Adding Transpose test to CI/CD

* Addressing Review Comment

* Addressing Comments

* Addressing Comments

* Measuring Perf Tests

* Code Cleanup

* Changelog

* Added the running iterations

* clang format

* Fix the changelog

* Fix the compilation error

* change the printing factor

---------

Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>
This commit is contained in:
Thomas Ning
2025-05-12 00:41:45 -07:00
committed by GitHub
parent d8faf1c6a1
commit 9d1e44e56a
14 changed files with 311 additions and 152 deletions

View File

@@ -24,4 +24,6 @@ args:
-layout_out output tensor data layout - NHWC by default
-seed seed to be used, -1 means random every time (default:-1)
-k_name set to 1 will print kernel name (default:0)
-warmup warmup iterations to run this kernel (default:50)
-repeat number of iterations to run this kernel (default:100)
```

View File

@@ -1,7 +1,6 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "batched_transpose_example.hpp"
#include <iostream>
template <typename ts_type,
ck_tile::index_t block_x,
@@ -9,23 +8,23 @@ template <typename ts_type,
ck_tile::index_t warp_x,
ck_tile::index_t warp_y,
ck_tile::index_t thread_x,
ck_tile::index_t thread_y>
ck_tile::index_t thread_y,
bool kPadM,
bool kPadN>
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
{
uint32_t dim_block_h = (a.height + block_y - 1) / block_y;
uint32_t dim_block_w = (a.width + block_x - 1) / block_x;
uint32_t dim_stride = a.height * a.width;
uint32_t dim_stride = a.height * a.width;
a.dim_stride = dim_stride;
a.dim_block_h = dim_block_h;
a.dim_block_w = dim_block_w;
a.dim_block_h = block_y;
a.dim_block_w = block_x;
using block_tile = ck_tile::sequence<block_x, block_y>;
using warp_tile = ck_tile::sequence<warp_x, warp_y>;
using thread_tile = ck_tile::sequence<thread_x, thread_y>;
using ts_problem =
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile>;
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile, kPadM, kPadN>;
using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
@@ -35,25 +34,40 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
const dim3 grids = kernel::GridSize(a);
constexpr dim3 blocks = kernel::BlockSize();
printf("Grid: %u %u %u\n", grids.x, grids.y, grids.z);
printf("Block: %u %u %u\n", blocks.x, blocks.y, blocks.z);
printf("kargs: kargs.batch %d kargs.height %d kargs.width %d kargs.dim_strid %d\n",
kargs.batch,
kargs.height,
kargs.width,
kargs.dim_stride);
printf("Launching Kernel...\n");
float ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));
printf("Kernel finished...\n");
return ave_time;
}
// Param Comb: type_size, block_x & y, warp_x & y, thread_x & y
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp16, ck_tile::fp16_t, 16, 16, 8, 8, 1, 1) \
F(bf16, ck_tile::bf16_t, 16, 16, 8, 8, 1, 1) \
F(fp32, ck_tile::fp32_t, 16, 16, 8, 8, 1, 1) \
F(int8, ck_tile::int8_t, 16, 16, 8, 8, 1, 1)
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, false, false) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, false, false) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, false, false)
// Macro that defines one static function per line
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY) \
static float transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY>(a, s); \
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN) \
static float \
transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY##_##PADM##_##PADN( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN>(a, s); \
}
FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
@@ -62,21 +76,38 @@ float batched_transpose(batched_transpose_trait t,
batched_transpose_kargs a,
ck_tile::stream_config s)
{
if(t.type == "fp16")
if(t.type == "fp8")
{
return transpose_fn_fp16_16_16_8_8_1_1(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp8_64_64_64_64_8_8_false_false(a, s);
}
else
{
return transpose_fn_fp8_64_64_64_64_8_8_true_true(a, s);
}
}
else if(t.type == "fp16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp16_64_64_64_64_8_8_false_false(a, s);
}
else
{
return transpose_fn_fp16_64_64_64_64_8_8_true_true(a, s);
}
}
else if(t.type == "bf16")
{
return transpose_fn_bf16_16_16_8_8_1_1(a, s);
}
else if(t.type == "fp32")
{
return transpose_fn_fp32_16_16_8_8_1_1(a, s);
}
else if(t.type == "int8")
{
return transpose_fn_int8_16_16_8_8_1_1(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_64_64_8_8_false_false(a, s);
}
else
{
return transpose_fn_bf16_64_64_64_64_8_8_true_true(a, s);
}
}
return -1;
}

View File

@@ -21,13 +21,13 @@ void dump_host_tensor_4d(const ck_tile::HostTensor<T>& x)
std::cout << "[";
for(size_t i = 0; i < len[0]; i++)
{
std::cout << i << ": [";
std::cout << "Batch " << i << ":" << std::endl;
for(size_t j = 0; j < len[1]; j++)
{
std::cout << j << ": [";
std::cout << " Channel " << j << ":" << std::endl;
for(size_t k = 0; k < len[2]; k++)
{
std::cout << k << ": [";
std::cout << " Row " << k << ": ";
for(size_t v = 0; v < len[3]; v++)
{
if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
@@ -41,15 +41,15 @@ void dump_host_tensor_4d(const ck_tile::HostTensor<T>& x)
}
else
{
std::cout << x(std::vector<std::size_t>{i, j, k, v}) << " ";
std::cout << static_cast<int>(x(std::vector<std::size_t>{i, j, k, v}))
<< " ";
}
}
std::cout << "]" << std::endl;
std::cout << std::endl;
}
std::cout << "]" << std::endl;
}
std::cout << std::endl;
}
std::cout << "]" << std::endl;
std::cout << "--------------------" << std::endl;
}
#endif
@@ -93,12 +93,14 @@ auto create_args(int argc, char* argv[])
ck_tile::ArgParser arg_parser;
arg_parser.insert("v", "1", "whether do CPU validation or not")
.insert("pr", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)")
.insert("N", "2", "input batch size. ")
.insert("C", "16", "input channel size.")
.insert("H", "1", "input height size.")
.insert("W", "16", "input width size. ")
.insert("N", "1", "input batch size. ")
.insert("C", "64", "input channel size.")
.insert("H", "18", "input height size.")
.insert("W", "64", "input width size. ")
.insert("layout_in", "NCHW", "input tensor data layout - NCHW by default")
.insert("layout_out", "NHWC", "output tensor data layout - NHWC by default ")
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("seed", "-1", "seed to be used, -1 means random every time")
.insert("kname", "0", "t to 1 will print kernel name");
@@ -115,6 +117,8 @@ bool run_batched_transpose(ck_tile::ArgParser args)
int C = args.get_int("C");
int H = args.get_int("H");
int W = args.get_int("W");
int n_warmup = args.get_int("warmup");
int n_repeat = args.get_int("repeat");
std::string layout_in = args.get_str("layout_in");
std::string layout_out = args.get_str("layout_out");
int seed = args.get_int("seed");
@@ -177,7 +181,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)
return a_;
}();
ck_tile::stream_config sc{nullptr, true};
ck_tile::stream_config sc{nullptr, true, n_warmup, n_repeat};
auto ms = batched_transpose(trait, karg, sc);
@@ -202,7 +206,8 @@ bool run_batched_transpose(ck_tile::ArgParser args)
layout_in.c_str(),
ms);
if(ms < 0)
printf("not supported\n");
printf("------------------------------------not "
"supported-------------------------------------\n");
fflush(stdout);
if(ms < 0)
@@ -227,7 +232,9 @@ bool run_batched_transpose(ck_tile::ArgParser args)
rtn &= ck_tile::check_err(
y_host, y_ref, std::string("y Error: Incorrect results!"), rtol, atol);
}
printf("valid:%s\n", rtn ? "y" : "n");
printf("-----------------------------------------------------------------------valid:%s--------"
"--------------------------------------------------------------------\n",
rtn ? "y" : "n");
fflush(stdout);
return rtn;
}
@@ -240,9 +247,9 @@ int main(int argc, char** argv)
std::string prec = args.get_str("pr");
bool r = true;
if(prec.compare("fp32") == 0)
if(prec.compare("fp8") == 0)
{
r &= run_batched_transpose<float>(args);
r &= run_batched_transpose<ck_tile::fp8_t>(args);
}
else if(prec.compare("fp16") == 0)
{
@@ -252,10 +259,6 @@ int main(int argc, char** argv)
{
r &= run_batched_transpose<ck_tile::bf16_t>(args);
}
else if(prec.compare("int8") == 0)
{
r &= run_batched_transpose<ck_tile::int8_t>(args);
}
return r ? 0 : -1;
}

View File

@@ -0,0 +1,11 @@
#!/bin/sh
EXE=./build/bin/tile_example_batched_transpose
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=4096 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
done

View File

@@ -0,0 +1,38 @@
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_batched_transpose executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
#get the command line arguments:
export env_type=$1
echo 'Environment type: ' $env_type
export branch=$2
echo 'Branch name: ' $branch
export host_name=$3
echo 'Host name: ' $host_name
export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run verification tests
example/ck_tile/35_batched_transpose/script/smoke_test.sh
#run performance benchmarks

View File

@@ -2,10 +2,26 @@
EXE=./build/bin/tile_example_batched_transpose
for pr in "fp32" "fp16" "int8" ; do
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC'
done
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=16 -C=64 -H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW'
done