[CK-Tile] Merge transpose examples (#2450)

* unify pipeline signature with existing example * iwyu * move stuff around in load-tile-transpose * cleanups in batched transpose pipeline * comments * use same inputs size * cleaner printf * print host args * use 64 block sides in the 37_transpose example * roll back grid dimension size adjustment for 37_transpose example * transpose grid for 37_transpose to unify with 35_batched_transpose * unify grid computation logic * make policy methods device only (since they are used only on device from the pipeline) * more host/device attribute cleanups * copy over problem * move over pipeline and policy * add switch to batched transpose api * make the lds problem more similar to original problem * factor out logic into traits * factor out conditional compilation into trait parameter * propagate pipeline to args * unhardcode pipeline dispatch parameter * refactor vector size * put warp tile out of dispatch * rename template parameter for trait * rewrite vector size in terms of problem * mark policy-internal struct variable as device * factor out input distribution and thread access pattern from policies * reword vector size * use datatype across batched transpose pipelines, problems and kernel * remove transpose traits from lds pipeline * add padding to the lds pipeline *interface* * add comment * remove ck_tile example #37 * update cmakelists * add test for new pipeline * update batched transpose test * roll back load_tile_transpose changes * remove comments * pack dispatch parameters into a config * padM can be enabled * adjust lds vector size to enable padding along N * update test * clean up logic * swap m/n input vector size * adjust perf test script * sweep over C/W in perf test * count both read and written bytes into bandwidth (x2 the number) * clang-format * widen size range for perf test * remove 64k x 64k case; it's too large for index * remove thread tile from dispatch * Solve merge conflict * fix compile * modify the transpose * solve the test error and clang format * Add v3 support for Groupd fwd conv+bias+clamp & ckProfiler (#2463) * Add logging to IsSupported. * Less casting in AddClamp * Conv+bias+clamp instances & profiler BF16 * Fix 3D instances & run just 1x for verification. * :Run just once for verification conv fwd. * ckProfiler conv fwd clampwq * Remove exec bit & formatting * Add support for MultiD for grouped conv fwd v3. * Enable 2Lds. * clean * align instances * align instances * profiler fixes * Fixes * fix * fix --------- Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu> Co-authored-by: Bartłomiej Kocot <barkocot@amd.com> * Fixing 0ms and inf GB/s issue in img2col (#2565) issue : ==== ``` sh $ bin/tile_example_img2col Perf: 0 ms, inf GB/s ``` solution : ====== Problem occured because config.time_kernel is false by default. if false, then no need to calculate perf, just print proper message `image_to_coloumn: pass, No Perf generated due to config.time_kernel=0` * merge with develop * solve clang format --------- Co-authored-by: ThomasNing <thomas.ning@amd.com> Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu> Co-authored-by: Bartłomiej Kocot <barkocot@amd.com> Co-authored-by: rahjain-amd <Rahul.Jain@amd.com>
2026-05-04 05:31:24 +00:00 · 2025-07-26 21:51:54 -07:00
parent d2459878cf
commit 821cd26c13
24 changed files with 431 additions and 869 deletions
--- a/test/ck_tile/batched_transpose/batched_transpose_api.cpp
+++ b/test/ck_tile/batched_transpose/batched_transpose_api.cpp
@@ -7,8 +7,6 @@ template <typename ts_type,
          ck_tile::index_t block_y,
          ck_tile::index_t warp_x,
          ck_tile::index_t warp_y,
-          ck_tile::index_t thread_x,
-          ck_tile::index_t thread_y,
          bool kPadM,
          bool kPadN>
 float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
@@ -20,11 +18,10 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
    a.dim_block_w = block_x;

    using block_tile  = ck_tile::sequence<block_x, block_y>;
-    using warp_tile   = ck_tile::sequence<warp_x, warp_y>;
-    using thread_tile = ck_tile::sequence<thread_x, thread_y>;
+    using warp_layout = ck_tile::sequence<warp_x, warp_y>;

    using ts_problem =
-        ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile, kPadM, kPadN>;
+        ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
    using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;

    using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
@@ -53,21 +50,20 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
 }

 // Param Comb: type_size, block_x & y, warp_x & y, thread_x & y
-#define FOREACH_TRANSPOSE_PARAM(F)                               \
-    F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, true, true)     \
-    F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, false, false)   \
-    F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, true, true)   \
-    F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, false, false) \
-    F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, true, true)   \
-    F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, false, false)
+#define FOREACH_TRANSPOSE_PARAM(F)                       \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true)     \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false) \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true)   \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false)

 // Macro that defines one static function per line
-#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN)             \
-    static float                                                                                \
-        transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY##_##PADM##_##PADN(  \
-            batched_transpose_kargs& a, ck_tile::stream_config& s)                              \
-    {                                                                                           \
-        return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN>(a, s); \
+#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, PADM, PADN)               \
+    static float transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##PADM##_##PADN( \
+        batched_transpose_kargs& a, ck_tile::stream_config& s)                            \
+    {                                                                                     \
+        return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, PADM, PADN>(a, s);   \
    }

 FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
@@ -80,33 +76,33 @@ float batched_transpose(batched_transpose_trait t,
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
-            return transpose_fn_fp8_64_64_64_64_8_8_false_false(a, s);
+            return transpose_fn_fp8_64_64_1_1_false_false(a, s);
        }
        else
        {
-            return transpose_fn_fp8_64_64_64_64_8_8_true_true(a, s);
+            return transpose_fn_fp8_64_64_1_1_true_true(a, s);
        }
    }
    else if(t.type == "fp16")
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
-            return transpose_fn_fp16_64_64_64_64_8_8_false_false(a, s);
+            return transpose_fn_fp16_64_64_1_1_false_false(a, s);
        }
        else
        {
-            return transpose_fn_fp16_64_64_64_64_8_8_true_true(a, s);
+            return transpose_fn_fp16_64_64_1_1_true_true(a, s);
        }
    }
    else if(t.type == "bf16")
    {
        if(a.height % 64 == 0 && a.width % 64 == 0)
        {
-            return transpose_fn_bf16_64_64_64_64_8_8_false_false(a, s);
+            return transpose_fn_bf16_64_64_1_1_false_false(a, s);
        }
        else
        {
-            return transpose_fn_bf16_64_64_64_64_8_8_true_true(a, s);
+            return transpose_fn_bf16_64_64_1_1_true_true(a, s);
        }
    }
    return -1;