From 751e29ccb678156e3735a224006711a8faf4e9a4 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Mon, 9 Mar 2026 15:36:30 -0700 Subject: [PATCH] [CK_BUILDER] Clean up ConvDescription output formatting (#5085) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `ConvDescription::getDetailedDescription()` output had several issues that made it harder to read and potentially misleading: 1. **Bug fix**: The LDS padding field was incorrectly displaying `dst_scalar_per_vector_k1` instead of the actual `lds_padding` value 2. **Noise reduction**: Optional parameters that weren't set were printing unhelpful messages like "Struct does not contain optional gemm_padding argument" — these add clutter without providing value to the reader 3. **Formatting inconsistencies**: Trailing spaces after colons (e.g., `"Warp Gemm parameters: "`) and a stray trailing `×` in tile dimensions 4. **Missing thread cluster lengths**: The threads per axis were not shown. **Changes**: - **Fixed the LDS padding bug** by using `traits_.a_tile_transfer.transfer_params.lds_padding` and `traits_.b_tile_transfer.transfer_params.lds_padding` instead of duplicating `dst_scalar_per_vector_k1` - **Simplified optional parameter handling**: Changed from printing "Struct does not contain..." messages to simply omitting absent optional values. Also switched from `.value_or()` to direct dereference (`*`) since we're already inside an `if` check - **Cleaned up formatting**: Removed trailing spaces after colons and the extra `×` at the end of tile dimension lists - **Added missing thread cluster lengths**: Added an X×Y×Z display for both A and B tile transfer sections. 
- **Fixed typo**: "Do Padd Gemm" → "Do Pad Gemm" - **Fixed typo**: "scr" → "src" - **Fixed typo**: "tensros" → "tensors" - `ninja smoke-builder` ✓ - `ninja check-builder` ✓ The test file updates reflect the corrected expected output, which now shows the actual `lds_padding` values (0 or 1), shows thread cluster lengths, and omits the verbose "Struct does not contain..." lines. **Note**: This PR follows PR #5083. --- .../builder/reflect/conv_description.hpp | 61 +++-- .../ck_tile/builder/reflect/conv_traits.hpp | 2 +- .../builder/test/test_conv_description.cpp | 216 +++++++++--------- 3 files changed, 132 insertions(+), 147 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp index 01069c1140..3595a6bd98 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp @@ -85,19 +85,16 @@ class ConvDescription : public Description "×", traits_.tile_dims.k); if(traits_.gemm_padding) - algo.add("Gemm padding: ", - traits_.gemm_padding.value_or(builder::GemmPadding::DEFAULT)); - else - algo.add("Struct does not contain optional gemm_padding argument"); + algo.add("Gemm padding: ", *traits_.gemm_padding); if(traits_.do_pad_gemm_m) - algo.add("Do Padd Gemm M: ", traits_.do_pad_gemm_m.value_or(false)); + algo.add("Do Pad Gemm M: ", *traits_.do_pad_gemm_m); if(traits_.do_pad_gemm_n) - algo.add("Do Padd Gemm N: ", traits_.do_pad_gemm_n.value_or(false)); + algo.add("Do Pad Gemm N: ", *traits_.do_pad_gemm_n); algo.add("Convolution specialization: ", traits_.conv_specialization); // Pipeline section algo.add("Pipeline version: ", traits_.pipeline_version); algo.add("Pipeline scheduler: ", traits_.pipeline_scheduler); - auto& warpGemm = algo.add("Warp Gemm parameters: "); + auto& warpGemm = algo.add("Warp Gemm parameters:"); warpGemm.add("subtile size: ", 
traits_.warp_gemm.gemm_m, "×", traits_.warp_gemm.gemm_n); warpGemm.add("Number of warp gemm iterations: ", traits_.warp_gemm.m_iter, @@ -107,16 +104,21 @@ class ConvDescription : public Description // Memory Access section auto& memAccess = algo.add("Memory access:"); - auto& aTile = memAccess.add("A Tile transfer: "); + auto& aTile = memAccess.add("A Tile transfer:"); aTile.add("Tile dimensions: ", traits_.a_tile_transfer.tile_dimensions.k0, "×", traits_.a_tile_transfer.tile_dimensions.m_or_n, "×", - traits_.a_tile_transfer.tile_dimensions.k1, - "×"); + traits_.a_tile_transfer.tile_dimensions.k1); aTile.add("The innermost K subdimension size: ", traits_.a_tile_transfer.transfer_params.k1); + aTile.add("Thread cluster lengths (threads per axis): ", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[0], + "×", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[1], + "×", + traits_.a_tile_transfer.transfer_params.thread_cluster_dims[2]); aTile.add("Spatial thread distribution over the data tile: ", traits_.a_tile_transfer.transfer_params.thread_cluster_order[0], "×", @@ -136,18 +138,23 @@ class ConvDescription : public Description aTile.add("Vector access (LDS write) instruction size: ", traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1); aTile.add("LDS data layout padding (to prevent bank conflicts): ", - traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1); + traits_.a_tile_transfer.transfer_params.lds_padding); - auto& bTile = memAccess.add("B Tile transfer: "); + auto& bTile = memAccess.add("B Tile transfer:"); bTile.add("Tile dimensions: ", traits_.b_tile_transfer.tile_dimensions.k0, "×", traits_.b_tile_transfer.tile_dimensions.m_or_n, "×", - traits_.b_tile_transfer.tile_dimensions.k1, - "×"); + traits_.b_tile_transfer.tile_dimensions.k1); bTile.add("The innermost K subdimension size: ", traits_.b_tile_transfer.transfer_params.k1); + bTile.add("Thread cluster lengths (threads per axis): ", + 
traits_.b_tile_transfer.transfer_params.thread_cluster_dims[0], + "×", + traits_.b_tile_transfer.transfer_params.thread_cluster_dims[1], + "×", + traits_.b_tile_transfer.transfer_params.thread_cluster_dims[2]); bTile.add("Spatial thread distribution over the data tile: ", traits_.b_tile_transfer.transfer_params.thread_cluster_order[0], "×", @@ -167,9 +174,9 @@ class ConvDescription : public Description bTile.add("Vector access (LDS write) instruction size: ", traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1); bTile.add("LDS data layout padding (to prevent bank conflicts): ", - traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1); + traits_.b_tile_transfer.transfer_params.lds_padding); - auto& cTile = memAccess.add("C Tile transfer: "); + auto& cTile = memAccess.add("C Tile transfer:"); cTile.add("Data shuffle (number of gemm instructions per iteration): ", traits_.c_tile_transfer.shuffle_params.m_gemms_per_shuffle, "×", @@ -185,27 +192,15 @@ class ConvDescription : public Description cTile.add("Vector access (GMEM write) instruction size: ", traits_.c_tile_transfer.scalar_per_vector); if(traits_.num_gemm_k_prefetch_stage) - algo.add("Num gemm k prefetch stage: ", traits_.num_gemm_k_prefetch_stage.value_or(0)); - else - algo.add("Struct does not contain optional " - "num_gemm_k_prefetch_stage parameter"); - + algo.add("Num gemm k prefetch stage: ", *traits_.num_gemm_k_prefetch_stage); if(traits_.max_transpose_transfer_src_scalar_per_vector) - algo.add("Max Transpose transfer scr scalar per vector: ", - traits_.max_transpose_transfer_src_scalar_per_vector.value_or(0)); - else - algo.add("Struct does not contain optional " - "max_transpose_transfer_src_scalar_per_vector parameter"); + algo.add("Max Transpose transfer src scalar per vector: ", + *traits_.max_transpose_transfer_src_scalar_per_vector); if(traits_.max_transpose_transfer_dst_scalar_per_vector) algo.add("Max Transpose dst scalar per vector: ", - 
traits_.max_transpose_transfer_dst_scalar_per_vector.value_or(0)); - else - algo.add("Struct does not contain optional " - "max_transpose_transfer_dst_scalar_per_vector parameter"); + *traits_.max_transpose_transfer_dst_scalar_per_vector); if(traits_.num_groups_to_merge) - algo.add("Num groups to merge: ", traits_.num_groups_to_merge.value_or(0)); - else - algo.add("Struct does not contain optional num_groups_to_merge parameter"); + algo.add("Num groups to merge: ", *traits_.num_groups_to_merge); return root.getString(); } diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp index 21f6525534..318bdf4416 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp @@ -73,7 +73,7 @@ namespace ck_tile::reflect::conv { // There is a lot we still need to do: // // TODO: Generalize type support for all tensors and accumulator. -// TODO: Describe all tensros. +// TODO: Describe all tensors. // TODO: Include the full generalization of the signature from the input schema. // TODO: Include the full generalization of the algorithm from the input schema. 
struct ConvTraits diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp index aa2700c80e..8d943c7a6d 100644 --- a/experimental/builder/test/test_conv_description.cpp +++ b/experimental/builder/test/test_conv_description.cpp @@ -260,57 +260,52 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription) static constexpr const ConvSignature SIGNATURE; static constexpr const DefaultAlgorithm ALGORITHM; using Instance = ckb::ConvBuilder::Instance; - EXPECT_THAT( - ckr::describe().detailed(), - ckt::StringEqWithDiff( // - "2D Forward Convolution Kernel\n" - "├─ Signature\n" - "│ ├─ Tensor Type: FP16\n" - "│ ├─ Input Layout: GNHWC\n" - "│ ├─ Weight Layout: GKYXC\n" - "│ ├─ Output Layout: GNHWK\n" - "│ ├─ Input elementwise operation: PASS_THROUGH\n" - "│ ├─ Weights elementwise operation: PASS_THROUGH\n" - "│ └─ Output elementwise operation: PASS_THROUGH\n" - "└─ Algorithm\n" - " ├─ Thread block size: 256\n" - " ├─ Data tile size: 256×256×32\n" - " ├─ Gemm padding: DEFAULT\n" - " ├─ Convolution specialization: DEFAULT\n" - " ├─ Pipeline version: V4\n" - " ├─ Pipeline scheduler: INTRAWAVE\n" - " ├─ Warp Gemm parameters: \n" - " │ ├─ subtile size: 16×16\n" - " │ └─ Number of warp gemm iterations: 8×8\n" - " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 4×256×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" - " │ │ ├─ The order of accessing data tile axes: 0×1×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 2\n" - " │ │ ├─ Vector access (LDS write) instruction size: 2\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 4×256×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data 
tile: 0×1×2\n" - " │ │ ├─ The order of accessing data tile axes: 0×1×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 2\n" - " │ │ ├─ Vector access (LDS write) instruction size: 2\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n" - " │ └─ C Tile transfer: \n" - " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" - " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" - " │ └─ Vector access (GMEM write) instruction size: 2\n" - " ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n" - " ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector " - "parameter\n" - " ├─ Struct does not contain optional max_transpose_transfer_dst_scalar_per_vector " - "parameter\n" - " └─ Struct does not contain optional num_groups_to_merge parameter")); + EXPECT_THAT(ckr::describe().detailed(), + ckt::StringEqWithDiff( // + "2D Forward Convolution Kernel\n" + "├─ Signature\n" + "│ ├─ Tensor Type: FP16\n" + "│ ├─ Input Layout: GNHWC\n" + "│ ├─ Weight Layout: GKYXC\n" + "│ ├─ Output Layout: GNHWK\n" + "│ ├─ Input elementwise operation: PASS_THROUGH\n" + "│ ├─ Weights elementwise operation: PASS_THROUGH\n" + "│ └─ Output elementwise operation: PASS_THROUGH\n" + "└─ Algorithm\n" + " ├─ Thread block size: 256\n" + " ├─ Data tile size: 256×256×32\n" + " ├─ Gemm padding: DEFAULT\n" + " ├─ Convolution specialization: DEFAULT\n" + " ├─ Pipeline version: V4\n" + " ├─ Pipeline scheduler: INTRAWAVE\n" + " ├─ Warp Gemm parameters:\n" + " │ ├─ subtile size: 16×16\n" + " │ └─ Number of warp gemm iterations: 8×8\n" + " └─ Memory access:\n" + " ├─ A Tile transfer:\n" + " │ ├─ Tile dimensions: 4×256×8\n" + " │ ├─ The innermost K subdimension size: 8\n" + " │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n" + " │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" + " │ ├─ The order of accessing data 
tile axes: 0×1×2\n" + " │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ ├─ Vector access (GMEM read) instruction size: 2\n" + " │ ├─ Vector access (LDS write) instruction size: 2\n" + " │ └─ LDS data layout padding (to prevent bank conflicts): 0\n" + " ├─ B Tile transfer:\n" + " │ ├─ Tile dimensions: 4×256×8\n" + " │ ├─ The innermost K subdimension size: 8\n" + " │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n" + " │ ├─ Spatial thread distribution over the data tile: 0×1×2\n" + " │ ├─ The order of accessing data tile axes: 0×1×2\n" + " │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ ├─ Vector access (GMEM read) instruction size: 2\n" + " │ ├─ Vector access (LDS write) instruction size: 2\n" + " │ └─ LDS data layout padding (to prevent bank conflicts): 0\n" + " └─ C Tile transfer:\n" + " ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" + " ├─ Spatial thread distribution used to store data: 1×32×1×8\n" + " └─ Vector access (GMEM write) instruction size: 2")); } // Test printing of optional parameters num_groups_to_merge, @@ -384,38 +379,38 @@ TEST(ConvDescriptionTest, BwdWeightTwoStageWmmaV3DescriptionTest) "└─ Algorithm\n" " ├─ Thread block size: 256\n" " ├─ Data tile size: 128×128×16\n" - " ├─ Struct does not contain optional gemm_padding argument\n" " ├─ Convolution specialization: DEFAULT\n" " ├─ Pipeline version: V1\n" " ├─ Pipeline scheduler: DEFAULT\n" - " ├─ Warp Gemm parameters: \n" + " ├─ Warp Gemm parameters:\n" " │ ├─ subtile size: 32×32\n" " │ └─ Number of warp gemm iterations: 4×4\n" " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" + " │ ├─ A Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" " │ │ ├─ The order of accessing data tile axes: 
1×0×2\n" " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ ├─ B Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ └─ C Tile transfer: \n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ └─ C Tile transfer:\n" " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" " │ └─ Vector access (GMEM write) instruction size: 8\n" - " ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n" - " ├─ Max Transpose transfer scr scalar per vector: 1\n" + " ├─ Max Transpose transfer src scalar per vector: 1\n" " ├─ Max Transpose dst scalar per vector: 1\n" " └─ Num groups to merge: 4")); } @@ -473,57 +468,52 @@ TEST(ConvDescriptionTest, BwdWeightWmmaCshuffleV3DescriptionTest) ck::PipelineVersion::v1, // BlkGemmPipelineVer false>; // BComputeDataType - EXPECT_THAT( - ckr::describe().detailed(), - ckt::StringEqWithDiff( // - "3D Backward Weight Convolution Kernel\n" - "├─ Signature\n" - "│ ├─ Tensor Type: FP16\n" - "│ ├─ Input Layout: GNDHWC\n" - "│ ├─ Weight Layout: GKZYXC\n" - "│ ├─ Output Layout: GNDHWK\n" 
- "│ ├─ Input elementwise operation: PASS_THROUGH\n" - "│ ├─ Weights elementwise operation: PASS_THROUGH\n" - "│ └─ Output elementwise operation: PASS_THROUGH\n" - "└─ Algorithm\n" - " ├─ Thread block size: 256\n" - " ├─ Data tile size: 128×128×16\n" - " ├─ Struct does not contain optional gemm_padding argument\n" - " ├─ Convolution specialization: DEFAULT\n" - " ├─ Pipeline version: V1\n" - " ├─ Pipeline scheduler: DEFAULT\n" - " ├─ Warp Gemm parameters: \n" - " │ ├─ subtile size: 32×32\n" - " │ └─ Number of warp gemm iterations: 4×4\n" - " ├─ Memory access:\n" - " │ ├─ A Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" - " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" - " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ ├─ B Tile transfer: \n" - " │ │ ├─ Tile dimensions: 2×128×8×\n" - " │ │ ├─ The innermost K subdimension size: 8\n" - " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" - " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" - " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" - " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" - " │ │ ├─ Vector access (LDS write) instruction size: 8\n" - " │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n" - " │ └─ C Tile transfer: \n" - " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" - " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" - " │ └─ Vector access (GMEM write) instruction size: 8\n" - " ├─ Num gemm k prefetch stage: 1\n" - " ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector " - "parameter\n" - " ├─ Struct does not 
contain optional max_transpose_transfer_dst_scalar_per_vector " - "parameter\n" - " └─ Struct does not contain optional num_groups_to_merge parameter")); + EXPECT_THAT(ckr::describe().detailed(), + ckt::StringEqWithDiff( // + "3D Backward Weight Convolution Kernel\n" + "├─ Signature\n" + "│ ├─ Tensor Type: FP16\n" + "│ ├─ Input Layout: GNDHWC\n" + "│ ├─ Weight Layout: GKZYXC\n" + "│ ├─ Output Layout: GNDHWK\n" + "│ ├─ Input elementwise operation: PASS_THROUGH\n" + "│ ├─ Weights elementwise operation: PASS_THROUGH\n" + "│ └─ Output elementwise operation: PASS_THROUGH\n" + "└─ Algorithm\n" + " ├─ Thread block size: 256\n" + " ├─ Data tile size: 128×128×16\n" + " ├─ Convolution specialization: DEFAULT\n" + " ├─ Pipeline version: V1\n" + " ├─ Pipeline scheduler: DEFAULT\n" + " ├─ Warp Gemm parameters:\n" + " │ ├─ subtile size: 32×32\n" + " │ └─ Number of warp gemm iterations: 4×4\n" + " ├─ Memory access:\n" + " │ ├─ A Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" + " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" + " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" + " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" + " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" + " │ │ ├─ Vector access (LDS write) instruction size: 8\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ ├─ B Tile transfer:\n" + " │ │ ├─ Tile dimensions: 2×128×8\n" + " │ │ ├─ The innermost K subdimension size: 8\n" + " │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n" + " │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n" + " │ │ ├─ The order of accessing data tile axes: 1×0×2\n" + " │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n" + " │ │ ├─ Vector access (GMEM read) instruction size: 8\n" + " │ │ ├─ Vector access (LDS write) instruction size: 
8\n" + " │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n" + " │ └─ C Tile transfer:\n" + " │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n" + " │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n" + " │ └─ Vector access (GMEM write) instruction size: 8\n" + " └─ Num gemm k prefetch stage: 1")); } TEST(ConvDescriptionTest, DefaultInstanceHasInstanceString)