[CK-Tile] Merge transpose examples (#2450)

* unify pipeline signature with existing example

* iwyu

* move stuff around in load-tile-transpose

* cleanups in batched transpose pipeline

* comments

* use same inputs size

* cleaner printf

* print host args

* use 64 block sides in the 37_transpose example

* roll back grid dimension size adjustment for 37_transpose example

* transpose grid for 37_transpose to unify with 35_batched_transpose

* unify grid computation logic

* make policy methods device only (since they are used only on device from the pipeline)

* more host/device attribute cleanups

* copy over problem

* move over pipeline and policy

* add switch to batched transpose api

* make the lds problem more similar to original problem

* factor out logic into traits

* factor out conditional compilation into trait parameter

* propagate pipeline to args

* unhardcode pipeline dispatch parameter

* refactor vector size

* put warp tile out of dispatch

* rename template parameter for trait

* rewrite vector size in terms of problem

* mark policy-internal struct variable as device

* factor out input distribution and thread access pattern from policies

* reword vector size

* use datatype across batched transpose pipelines, problems and kernel

* remove transpose traits from lds pipeline

* add padding to the lds pipeline *interface*

* add comment

* remove ck_tile example #37

* update cmakelists

* add test for new pipeline

* update batched transpose test

* roll back load_tile_transpose changes

* remove comments

* pack dispatch parameters into a config

* padM can be enabled

* adjust lds vector size to enable padding along N

* update test

* clean up logic

* swap m/n input vector size

* adjust perf test script

* sweep over C/W in perf test

* count both read and written bytes into bandwidth (x2 the number)

* clang-format

* widen size range for perf test

* remove 64k x 64k case; it's too large for index

* remove thread tile from dispatch

* Solve merge conflict

* fix compile

* modify the transpose

* solve the test error and clang format

* Add v3 support for Grouped fwd conv+bias+clamp & ckProfiler (#2463)

* Add logging to IsSupported.

* Less casting in AddClamp

* Conv+bias+clamp instances & profiler BF16

* Fix 3D instances & run just 1x for verification.

* Run just once for verification conv fwd.

* ckProfiler conv fwd clamp

* Remove exec bit & formatting

* Add support for MultiD for grouped conv fwd v3.

* Enable 2Lds.

* clean

* align instances

* align instances

* profiler fixes

* Fixes

* fix

* fix

---------

Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Fixing 0ms and inf GB/s issue in img2col (#2565)

issue :
====
``` sh
$ bin/tile_example_img2col
Perf: 0 ms, inf GB/s
```

solution :
======
Problem occurred because config.time_kernel is false by default.
If false, then there is no need to calculate perf; just print a proper message instead:

`image_to_coloumn: pass, No Perf generated due to config.time_kernel=0`

* merge with develop

* solve clang format

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
Co-authored-by: rahjain-amd <Rahul.Jain@amd.com>
This commit is contained in:
Max Podkorytov
2025-07-26 21:51:54 -07:00
committed by GitHub
parent d2459878cf
commit 821cd26c13
24 changed files with 431 additions and 869 deletions

View File

@@ -2,41 +2,93 @@
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "batched_transpose_example.hpp"
template <typename ts_type,
ck_tile::index_t block_x,
ck_tile::index_t block_y,
ck_tile::index_t warp_x,
ck_tile::index_t warp_y,
ck_tile::index_t thread_x,
ck_tile::index_t thread_y,
bool kPadM,
bool kPadN>
namespace {
template <int32_t pipeline_id>
struct kernel_traits;
template <>
struct kernel_traits<0>
{
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Problem =
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
using Policy = ck_tile::BatchedTransposePolicy;
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Pipeline =
ck_tile::BatchedTransposePipeline<Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
Policy>;
};
template <>
struct kernel_traits<1>
{
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Problem =
ck_tile::BatchedTransposeLdsProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
using Policy = ck_tile::BatchedTransposeLdsPolicy;
template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
using Pipeline = ck_tile::BatchedTransposeLdsPipeline<
Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
Policy>;
};
} // namespace
template <typename InputType_,
ck_tile::index_t BlockX_,
ck_tile::index_t BlockY_,
ck_tile::index_t NumWarpsX_,
ck_tile::index_t NumWarpsY_,
bool PadM_,
bool PadN_,
ck_tile::index_t PipelineId_>
struct BatchedTransposeConfig
{
using InputType = InputType_;
static constexpr ck_tile::index_t kBlockX = BlockX_;
static constexpr ck_tile::index_t kBlockY = BlockY_;
static constexpr ck_tile::index_t kNumWarpsX = NumWarpsX_;
static constexpr ck_tile::index_t kNumWarpsY = NumWarpsY_;
static constexpr bool kPadM = PadM_;
static constexpr bool kPadN = PadN_;
static constexpr ck_tile::index_t kPipelineId = PipelineId_;
};
template <typename Config>
float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
{
uint32_t dim_stride = a.height * a.width;
a.dim_stride = dim_stride;
a.dim_block_h = block_y;
a.dim_block_w = block_x;
a.dim_block_h = Config::kBlockY;
a.dim_block_w = Config::kBlockX;
using block_tile = ck_tile::sequence<block_x, block_y>;
using warp_tile = ck_tile::sequence<warp_x, warp_y>;
using thread_tile = ck_tile::sequence<thread_x, thread_y>;
using ts_problem =
ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile, kPadM, kPadN>;
using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;
using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
// TODO: this is fragile and slow to compile
using kernel = ck_tile::BatchedTransposeKernel<
typename kernel_traits<Config::kPipelineId>::template Pipeline<
typename Config::InputType,
ck_tile::sequence<Config::kBlockX, Config::kBlockY>,
ck_tile::sequence<Config::kNumWarpsX, Config::kNumWarpsY>,
Config::kPadM,
Config::kPadN>>;
auto kargs = kernel::MakeKargs(a);
const dim3 grids = kernel::GridSize(a);
constexpr dim3 blocks = kernel::BlockSize();
printf("Grid: %u %u %u\n", grids.x, grids.y, grids.z);
printf("Block: %u %u %u\n", blocks.x, blocks.y, blocks.z);
printf("kargs: kargs.batch %d kargs.height %d kargs.width %d kargs.dim_strid %d\n",
printf("Pipeline: %d\n", Config::kPipelineId);
printf("Grid: x=%u y=%u z=%u\n", grids.x, grids.y, grids.z);
printf("Block: x=%u y=%u z=%u\n", blocks.x, blocks.y, blocks.z);
printf(
"Host args: batch=%d, height=%d, width=%d, dim_stride=%d, dim_block_h=%d, dim_block_w=%d\n",
a.batch,
a.height,
a.width,
a.dim_stride,
a.dim_block_h,
a.dim_block_w);
printf("kargs: kargs.batch=%d kargs.height=%d kargs.width=%d kargs.dim_stride=%d\n",
kargs.batch,
kargs.height,
kargs.width,
@@ -52,22 +104,29 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
return ave_time;
}
// Param Comb: type_size, block_x & y, warp_x & y, thread_x & y
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, false, false) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, false, false) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, true, true) \
F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, false, false)
// Param Comb: type_size, block_x & y, WarpNum_x & y
#define FOREACH_TRANSPOSE_PARAM(F) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 0) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 0) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 0) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 0) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 0) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 0) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 1) \
F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 1) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 1) \
F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 1) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 1) \
F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 1)
// Macro that defines one static function per line
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN) \
static float \
transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY##_##PADM##_##PADN( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN>(a, s); \
#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE) \
static float \
transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##PADM##_##PADN##_v##PIPE( \
batched_transpose_kargs& a, ck_tile::stream_config& s) \
{ \
return batched_transpose_dispatch< \
BatchedTransposeConfig<REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE>>(a, s); \
}
FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
@@ -76,38 +135,78 @@ float batched_transpose(batched_transpose_trait t,
batched_transpose_kargs a,
ck_tile::stream_config s)
{
if(t.type == "fp8")
if(t.pipeline == "0")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
if(t.type == "fp8")
{
return transpose_fn_fp8_64_64_64_64_8_8_false_false(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp8_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_fp8_64_64_1_1_true_true_v0(a, s);
}
}
else
else if(t.type == "fp16")
{
return transpose_fn_fp8_64_64_64_64_8_8_true_true(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp16_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_fp16_64_64_1_1_true_true_v0(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_1_1_false_false_v0(a, s);
}
else
{
return transpose_fn_bf16_64_64_1_1_true_true_v0(a, s);
}
}
}
else if(t.type == "fp16")
else if(t.pipeline == "1")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
if(t.type == "fp8")
{
return transpose_fn_fp16_64_64_64_64_8_8_false_false(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp8_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_fp8_64_64_1_1_true_true_v1(a, s);
}
}
else
else if(t.type == "fp16")
{
return transpose_fn_fp16_64_64_64_64_8_8_true_true(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_64_64_8_8_false_false(a, s);
}
else
{
return transpose_fn_bf16_64_64_64_64_8_8_true_true(a, s);
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_fp16_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_fp16_64_64_1_1_true_true_v1(a, s);
}
}
else if(t.type == "bf16")
{
if(a.height % 64 == 0 && a.width % 64 == 0)
{
return transpose_fn_bf16_64_64_1_1_false_false_v1(a, s);
}
else
{
return transpose_fn_bf16_64_64_1_1_true_true_v1(a, s);
}
}
}
return -1;
}

View File

@@ -102,7 +102,8 @@ auto create_args(int argc, char* argv[])
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("seed", "-1", "seed to be used, -1 means random every time")
.insert("kname", "0", "t to 1 will print kernel name");
.insert("kname", "0", "t to 1 will print kernel name")
.insert("pipeline", "0", "0: no LDS usage, 1: LDS-accelerated (gfx950)");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -121,6 +122,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)
int n_repeat = args.get_int("repeat");
std::string layout_in = args.get_str("layout_in");
std::string layout_out = args.get_str("layout_out");
std::string pipeline = args.get_str("pipeline");
int seed = args.get_int("seed");
int dim_in[4], dim_out[4];
@@ -166,7 +168,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)
x_dev.ToDevice(x_host.data());
auto trait = batched_transpose_trait{prec, layout_in};
auto trait = batched_transpose_trait{prec, layout_in, pipeline};
uint32_t height = nchw2nhwc ? C : H * W;
uint32_t width = nchw2nhwc ? H * W : C;
@@ -185,17 +187,15 @@ bool run_batched_transpose(ck_tile::ArgParser args)
auto ms = batched_transpose(trait, karg, sc);
std::size_t num_operations = N * C * H * (W - 1);
std::size_t num_bytes = N * C * H * W * sizeof(Type);
std::size_t num_bytes = N * C * H * W * sizeof(Type) * 2; // read + written
float ave_time = ms * 1E-3;
float gb_per_sec = num_bytes / ms * 1.E-6;
float tflops = static_cast<float>(num_operations) / ms * 1.E-6;
std::cout << "Run Batched Transpose kernel with N=" << N << ", C=" << C << ", H=" << H
<< ", W=" << W << ", layout_in=" << layout_in << ", layout_out=" << layout_out
<< " : " << ms << " ms (" << ave_time << " ave_time), " << tflops << " TFlops"
<< gb_per_sec << " GB/s, " << std::endl;
<< " : " << std::endl
<< ms << " ms " << std::endl
<< gb_per_sec << " GB/s " << std::endl;
printf("[%s]N:%d, C:%d, H:%d, W:%d, layout_in:%s, %f\n",
prec.c_str(),

View File

@@ -14,6 +14,7 @@ struct batched_transpose_trait
{
std::string type;
std::string layout;
std::string pipeline;
};
struct batched_transpose_kargs : public ck_tile::BatchedTransposeHostArgs

View File

@@ -5,10 +5,14 @@
EXE=./build/bin/tile_example_batched_transpose
for C in "64" "256" "1024" "4096" "16384"; do
for W in "64" "256" "1024" "4096" "16384"; do
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=4096 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC'
for pipeline in "0" "1"; do
$EXE -pipeline=$pipeline -pr=$pr -N=1 -C=$C -H=1 -W=$W -layout_in='NCHW' -layout_out='NHWC'
done
done
done
done

View File

@@ -6,25 +6,27 @@
EXE=./build/bin/tile_example_batched_transpose
for pr in "fp8" "fp16" "bf16"; do
$EXE -pr=$pr -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=16 -C=64 -H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW'
for pipeline in "0" "1"; do
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=16 -C=64 -H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC'
$EXE -pr=$pr -pipeline=$pipeline -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW'
done
done