From 751e29ccb678156e3735a224006711a8faf4e9a4 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Mon, 9 Mar 2026 15:36:30 -0700 Subject: [PATCH] [CK_BUILDER] Clean up ConvDescription output formatting (#5085) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `ConvDescription::getDetailedDescription()` output had several issues that made it harder to read and potentially misleading: 1. **Bug fix**: The LDS padding field was incorrectly displaying `dst_scalar_per_vector_k1` instead of the actual `lds_padding` value 2. **Noise reduction**: Optional parameters that weren't set were printing unhelpful messages like "Struct does not contain optional gemm_padding argument" — these add clutter without providing value to the reader 3. **Formatting inconsistencies**: Trailing spaces after colons (e.g., `"Warp Gemm parameters: "`) and a stray trailing `×` in tile dimensions 4. **Missing thread cluster lengths**: The threads per axis were not shown. **Changes**: - **Fixed the LDS padding bug** by using `traits_.a_tile_transfer.transfer_params.lds_padding` and `traits_.b_tile_transfer.transfer_params.lds_padding` instead of duplicating `dst_scalar_per_vector_k1` - **Simplified optional parameter handling**: Changed from printing "Struct does not contain..." messages to simply omitting absent optional values. Also switched from `.value_or()` to direct dereference (`*`) since we're already inside an `if` check - **Cleaned up formatting**: Removed trailing spaces after colons and the extra `×` at the end of tile dimension lists - **Added missing thread cluster lengths**: Added an X×Y×Z display for both A and B tile transfer sections. 
- **Fixed typo**: "Do Padd Gemm" → "Do Pad Gemm" - **Fixed typo**: "scr" → "src" - **Fixed typo**: "tensros" → "tensors" - `ninja smoke-builder` ✓ - `ninja check-builder` ✓ The test file updates reflect the corrected expected output, which now shows the actual `lds_padding` values (0 or 1), shows thread cluster lengths, and omits the verbose "Struct does not contain..." lines. **Note**: This PR follows PR #5083. --- .../builder/reflect/conv_description.hpp | 61 +++-- .../ck_tile/builder/reflect/conv_traits.hpp | 2 +- .../builder/test/test_conv_description.cpp | 216 +++++++++--------- 3 files changed, 132 insertions(+), 147 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp index 01069c1140..3595a6bd98 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp @@ -85,19 +85,16 @@ class ConvDescription : public Description "×", traits_.tile_dims.k); if(traits_.gemm_padding) - algo.add("Gemm padding: ", - traits_.gemm_padding.value_or(builder::GemmPadding::DEFAULT)); - else - algo.add("Struct does not contain optional gemm_padding argument"); + algo.add("Gemm padding: ", *traits_.gemm_padding); if(traits_.do_pad_gemm_m) - algo.add("Do Padd Gemm M: ", traits_.do_pad_gemm_m.value_or(false)); + algo.add("Do Pad Gemm M: ", *traits_.do_pad_gemm_m); if(traits_.do_pad_gemm_n) - algo.add("Do Padd Gemm N: ", traits_.do_pad_gemm_n.value_or(false)); + algo.add("Do Pad Gemm N: ", *traits_.do_pad_gemm_n); algo.add("Convolution specialization: ", traits_.conv_specialization); // Pipeline section algo.add("Pipeline version: ", traits_.pipeline_version); algo.add("Pipeline scheduler: ", traits_.pipeline_scheduler); - auto& warpGemm = algo.add("Warp Gemm parameters: "); + auto& warpGemm = algo.add("Warp Gemm parameters:"); warpGemm.add("subtile size: ", 
traits_.warp_gemm.gemm_m, "×", traits_.warp_gemm.gemm_n); warpGemm.add("Number of warp gemm iterations: ", traits_.warp_gemm.m_iter, @@ -107,16 +104,21 @@ class ConvDescription : public Description // Memory Access section auto& memAccess = algo.add("Memory access:"); - auto& aTile = memAccess.add("A Tile transfer: "); + auto& aTile = memAccess.add("A Tile transfer:"); aTile.add("Tile dimensions: ", traits_.a_tile_transfer.tile_dimensions.k0, "×", traits_.a_tile_transfer.tile_dimensions.m_or_n, "×", - traits_.a_tile_transfer.tile_dimensions.k1, - "×"); + traits_.a_tile_transfer.tile_dimensions.k1); aTile.add("The innermost K subdimension size: ", traits_.a_tile_transfer.transfer_params.k1); + aTile.add("Thread cluster lengths (threads per axis): ", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[0], + "×", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[1], + "×", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[2]); aTile.add("Spatial thread distribution over the data tile: ", traits_.a_tile_transfer.transfer_params.thread_cluster_order[0], "×", @@ -136,18 +138,23 @@ class ConvDescription : public Description aTile.add("Vector access (LDS write) instruction size: ", traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1); aTile.add("LDS data layout padding (to prevent bank conflicts): ", - traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1); + traits_.a_tile_transfer.transfer_params.lds_padding); - auto& bTile = memAccess.add("B Tile transfer: "); + auto& bTile = memAccess.add("B Tile transfer:"); bTile.add("Tile dimensions: ", traits_.b_tile_transfer.tile_dimensions.k0, "×", traits_.b_tile_transfer.tile_dimensions.m_or_n, "×", - traits_.b_tile_transfer.tile_dimensions.k1, - "×"); + traits_.b_tile_transfer.tile_dimensions.k1); bTile.add("The innermost K subdimension size: ", traits_.b_tile_transfer.transfer_params.k1); + bTile.add("Thread cluster lengths (threads per axis): ", + 
traits_.b_tile_transfer.transfer_params.thread_cluster_dims[0], + "×", + traits_.b_tile_transfer.transfer_params.thread_cluster_dims[1], + "×", + traits_.b_tile_transfer.transfer_params.thread_cluster_dims[2]); bTile.add("Spatial thread distribution over the data tile: ", traits_.b_tile_transfer.transfer_params.thread_cluster_order[0], "×", @@ -167,9 +174,9 @@ class ConvDescription : public Description bTile.add("Vector access (LDS write) instruction size: ", traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1); bTile.add("LDS data layout padding (to prevent bank conflicts): ", - traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1); + traits_.b_tile_transfer.transfer_params.lds_padding); - auto& cTile = memAccess.add("C Tile transfer: "); + auto& cTile = memAccess.add("C Tile transfer:"); cTile.add("Data shuffle (number of gemm instructions per iteration): ", traits_.c_tile_transfer.shuffle_params.m_gemms_per_shuffle, "×", @@ -185,27 +192,15 @@ class ConvDescription : public Description cTile.add("Vector access (GMEM write) instruction size: ", traits_.c_tile_transfer.scalar_per_vector); if(traits_.num_gemm_k_prefetch_stage) - algo.add("Num gemm k prefetch stage: ", traits_.num_gemm_k_prefetch_stage.value_or(0)); - else - algo.add("Struct does not contain optional " - "num_gemm_k_prefetch_stage parameter"); - + algo.add("Num gemm k prefetch stage: ", *traits_.num_gemm_k_prefetch_stage); if(traits_.max_transpose_transfer_src_scalar_per_vector) - algo.add("Max Transpose transfer scr scalar per vector: ", - traits_.max_transpose_transfer_src_scalar_per_vector.value_or(0)); - else - algo.add("Struct does not contain optional " - "max_transpose_transfer_src_scalar_per_vector parameter"); + algo.add("Max Transpose transfer src scalar per vector: ", + *traits_.max_transpose_transfer_src_scalar_per_vector); if(traits_.max_transpose_transfer_dst_scalar_per_vector) algo.add("Max Transpose dst scalar per vector: ", - 
traits_.max_transpose_transfer_dst_scalar_per_vector.value_or(0)); - else - algo.add("Struct does not contain optional " - "max_transpose_transfer_dst_scalar_per_vector parameter"); + *traits_.max_transpose_transfer_dst_scalar_per_vector); if(traits_.num_groups_to_merge) - algo.add("Num groups to merge: ", traits_.num_groups_to_merge.value_or(0)); - else - algo.add("Struct does not contain optional num_groups_to_merge parameter"); + algo.add("Num groups to merge: ", *traits_.num_groups_to_merge); return root.getString(); } diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp index 21f6525534..318bdf4416 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp @@ -73,7 +73,7 @@ namespace ck_tile::reflect::conv { // There is a lot we still need to do: // // TODO: Generalize type support for all tensors and accumulator. -// TODO: Describe all tensros. +// TODO: Describe all tensors. // TODO: Include the full generalization of the signature from the input schema. // TODO: Include the full generalization of the algorithm from the input schema. 
struct ConvTraits diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp index aa2700c80e..8d943c7a6d 100644 --- a/experimental/builder/test/test_conv_description.cpp +++ b/experimental/builder/test/test_conv_description.cpp @@ -260,57 +260,52 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription) static constexpr const ConvSignature SIGNATURE; static constexpr const DefaultAlgorithm ALGORITHM; using Instance = ckb::ConvBuilder::Instance; - EXPECT_THAT( - ckr::describe().detailed(), - ckt::StringEqWithDiff( // - "2D Forward Convolution Kernel\n" - "├─ Signature\n" - "│ ├─ Tensor Type: FP16\n" - "│ ├─ Input Layout: GNHWC\n" - "│ ├─ Weight Layout: GKYXC\n" - "│ ├─ Output Layout: GNHWK\n" - "│ ├─ Input elementwise operation: PASS_THROUGH\n" - "│ ├─ Weights elementwise operation: PASS_THROUGH\n" - "│ └─ Output elementwise operation: PASS_THROUGH\n" - "└─ Algorithm\n" - " ├─ Thread block size: 256\n" - " ├─ Data tile size: 256×256×32\n" - " ├─ Gemm padding: DEFAULT\n" - " ├─ Convolution specialization: DEFAULT\n" - " ├─ Pipeline version: V4\n" - " ├─ Pipeline scheduler: INTRAWAVE\n" - " ├─ Warp Gemm parameters: \n" - " │ ├─ subtile size: 16×16\n" - " │ └─ Number of warp gemm iterations: 8×8\n" - " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 4×256×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" - " │ │ ├─ The order of accessing data tile axes: 0×1×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 2\n" - " │ │ ├─ Vector access (LDS write) instruction size: 2\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 4×256×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data 
tile: 0×1×2\n" - " │ │ ├─ The order of accessing data tile axes: 0×1×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 2\n" - " │ │ ├─ Vector access (LDS write) instruction size: 2\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n" - " │ └─ C Tile transfer: \n" - " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" - " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" - " │ └─ Vector access (GMEM write) instruction size: 2\n" - " ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n" - " ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector " - "parameter\n" - " ├─ Struct does not contain optional max_transpose_transfer_dst_scalar_per_vector " - "parameter\n" - " └─ Struct does not contain optional num_groups_to_merge parameter")); + EXPECT_THAT(ckr::describe().detailed(), + ckt::StringEqWithDiff( // + "2D Forward Convolution Kernel\n" + "├─ Signature\n" + "│ ├─ Tensor Type: FP16\n" + "│ ├─ Input Layout: GNHWC\n" + "│ ├─ Weight Layout: GKYXC\n" + "│ ├─ Output Layout: GNHWK\n" + "│ ├─ Input elementwise operation: PASS_THROUGH\n" + "│ ├─ Weights elementwise operation: PASS_THROUGH\n" + "│ └─ Output elementwise operation: PASS_THROUGH\n" + "└─ Algorithm\n" + " ├─ Thread block size: 256\n" + " ├─ Data tile size: 256×256×32\n" + " ├─ Gemm padding: DEFAULT\n" + " ├─ Convolution specialization: DEFAULT\n" + " ├─ Pipeline version: V4\n" + " ├─ Pipeline scheduler: INTRAWAVE\n" + " ├─ Warp Gemm parameters:\n" + " │ ├─ subtile size: 16×16\n" + " │ └─ Number of warp gemm iterations: 8×8\n" + " └─ Memory access:\n" + " ├─ A Tile transfer:\n" + " │ ├─ Tile dimensions: 4×256×8\n" + " │ ├─ The innermost K subdimension size: 8\n" + " │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n" + " │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" + " │ ├─ The order of accessing data 
tile axes: 0×1×2\n" + " │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ ├─ Vector access (GMEM read) instruction size: 2\n" + " │ ├─ Vector access (LDS write) instruction size: 2\n" + " │ └─ LDS data layout padding (to prevent bank conflicts): 0\n" + " ├─ B Tile transfer:\n" + " │ ├─ Tile dimensions: 4×256×8\n" + " │ ├─ The innermost K subdimension size: 8\n" + " │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n" + " │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" + " │ ├─ The order of accessing data tile axes: 0×1×2\n" + " │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ ├─ Vector access (GMEM read) instruction size: 2\n" + " │ ├─ Vector access (LDS write) instruction size: 2\n" + " │ └─ LDS data layout padding (to prevent bank conflicts): 0\n" + " └─ C Tile transfer:\n" + " ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" + " ├─ Spatial thread distribution used to store data: 1×32×1×8\n" + " └─ Vector access (GMEM write) instruction size: 2")); } // Test printing of optional parameters num_groups_to_merge, @@ -384,38 +379,38 @@ TEST(ConvDescriptionTest, BwdWeightTwoStageWmmaV3DescriptionTest) "└─ Algorithm\n" " ├─ Thread block size: 256\n" " ├─ Data tile size: 128×128×16\n" - " ├─ Struct does not contain optional gemm_padding argument\n" " ├─ Convolution specialization: DEFAULT\n" " ├─ Pipeline version: V1\n" " ├─ Pipeline scheduler: DEFAULT\n" - " ├─ Warp Gemm parameters: \n" + " ├─ Warp Gemm parameters:\n" " │ ├─ subtile size: 32×32\n" " │ └─ Number of warp gemm iterations: 4×4\n" " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" + " │ ├─ A Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" " │ │ ├─ The order of accessing data tile axes: 
1×0×2\n" " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ ├─ B Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ └─ C Tile transfer: \n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ └─ C Tile transfer:\n" " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" " │ └─ Vector access (GMEM write) instruction size: 8\n" - " ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n" - " ├─ Max Transpose transfer scr scalar per vector: 1\n" + " ├─ Max Transpose transfer src scalar per vector: 1\n" " ├─ Max Transpose dst scalar per vector: 1\n" " └─ Num groups to merge: 4")); } @@ -473,57 +468,52 @@ TEST(ConvDescriptionTest, BwdWeightWmmaCshuffleV3DescriptionTest) ck::PipelineVersion::v1, // BlkGemmPipelineVer false>; // BComputeDataType - EXPECT_THAT( - ckr::describe().detailed(), - ckt::StringEqWithDiff( // - "3D Backward Weight Convolution Kernel\n" - "├─ Signature\n" - "│ ├─ Tensor Type: FP16\n" - "│ ├─ Input Layout: GNDHWC\n" - "│ ├─ Weight Layout: GKZYXC\n" - "│ ├─ Output Layout: GNDHWK\n" 
- "│ ├─ Input elementwise operation: PASS_THROUGH\n" - "│ ├─ Weights elementwise operation: PASS_THROUGH\n" - "│ └─ Output elementwise operation: PASS_THROUGH\n" - "└─ Algorithm\n" - " ├─ Thread block size: 256\n" - " ├─ Data tile size: 128×128×16\n" - " ├─ Struct does not contain optional gemm_padding argument\n" - " ├─ Convolution specialization: DEFAULT\n" - " ├─ Pipeline version: V1\n" - " ├─ Pipeline scheduler: DEFAULT\n" - " ├─ Warp Gemm parameters: \n" - " │ ├─ subtile size: 32×32\n" - " │ └─ Number of warp gemm iterations: 4×4\n" - " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" - " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" - " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" - " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" - " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ └─ C Tile transfer: \n" - " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" - " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" - " │ └─ Vector access (GMEM write) instruction size: 8\n" - " ├─ Num gemm k prefetch stage: 1\n" - " ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector " - "parameter\n" - " ├─ Struct does not 
contain optional max_transpose_transfer_dst_scalar_per_vector " - "parameter\n" - " └─ Struct does not contain optional num_groups_to_merge parameter")); + EXPECT_THAT(ckr::describe().detailed(), + ckt::StringEqWithDiff( // + "3D Backward Weight Convolution Kernel\n" + "├─ Signature\n" + "│ ├─ Tensor Type: FP16\n" + "│ ├─ Input Layout: GNDHWC\n" + "│ ├─ Weight Layout: GKZYXC\n" + "│ ├─ Output Layout: GNDHWK\n" + "│ ├─ Input elementwise operation: PASS_THROUGH\n" + "│ ├─ Weights elementwise operation: PASS_THROUGH\n" + "│ └─ Output elementwise operation: PASS_THROUGH\n" + "└─ Algorithm\n" + " ├─ Thread block size: 256\n" + " ├─ Data tile size: 128×128×16\n" + " ├─ Convolution specialization: DEFAULT\n" + " ├─ Pipeline version: V1\n" + " ├─ Pipeline scheduler: DEFAULT\n" + " ├─ Warp Gemm parameters:\n" + " │ ├─ subtile size: 32×32\n" + " │ └─ Number of warp gemm iterations: 4×4\n" + " ├─ Memory access:\n" + " │ ├─ A Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" + " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" + " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" + " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" + " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" + " │ │ ├─ Vector access (LDS write) instruction size: 8\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ ├─ B Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" + " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" + " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" + " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" + " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" + " │ │ ├─ Vector access (LDS write) instruction size: 
8\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ └─ C Tile transfer:\n" + " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" + " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" + " │ └─ Vector access (GMEM write) instruction size: 8\n" + " └─ Num gemm k prefetch stage: 1")); } TEST(ConvDescriptionTest, DefaultInstanceHasInstanceString)