mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
[rocm-libraries] ROCm/rocm-libraries#5085 (commit bb9cb27)
[CK_BUILDER] Clean up ConvDescription output formatting (#5085) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `ConvDescription::getDetailedDescription()` output had several issues that made it harder to read and potentially misleading: 1. **Bug fix**: The LDS padding field was incorrectly displaying `dst_scalar_per_vector_k1` instead of the actual `lds_padding` value 2. **Noise reduction**: Optional parameters that weren't set were printing unhelpful messages like "Struct does not contain optional gemm_padding argument" — these add clutter without providing value to the reader 3. **Formatting inconsistencies**: Trailing spaces after colons (e.g., `"Warp Gemm parameters: "`) and a stray trailing `×` in tile dimensions 4. **Missing thread cluster lengths**: The threads per axis are not shown. **Changes**: - **Fixed the LDS padding bug** by using `traits_.a_tile_transfer.transfer_params.lds_padding` and `traits_.b_tile_transfer.transfer_params.lds_padding` instead of duplicating `dst_scalar_per_vector_k1` - **Simplified optional parameter handling**: Changed from printing "Struct does not contain..." messages to simply omitting absent optional values. Also switched from `.value_or()` to direct dereference (`*`) since we're already inside an `if` check - **Cleaned up formatting**: Removed trailing spaces after colons and the extra `×` at the end of tile dimension lists - **Added missing thread cluster lengths**: Added "X×Y×Z" display for both A and B tile transfer sections. - **Fixed typo**: "Do Padd Gemm" → "Do Pad Gemm" - **Fixed typo**: "scr" → "src" - **Fixed typo**: "tensros" → "tensors" - `ninja smoke-builder` ✓ - `ninja check-builder` ✓ The test file updates reflect the corrected expected output, which now shows the actual `lds_padding` values (0 or 1), shows thread cluster lengths, and omits the verbose "Struct does not contain..." lines. **Note**: This PR follows PR #5083.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
8c216604d4
commit
846bcacab4
@@ -85,19 +85,16 @@ class ConvDescription : public Description
|
||||
"×",
|
||||
traits_.tile_dims.k);
|
||||
if(traits_.gemm_padding)
|
||||
algo.add("Gemm padding: ",
|
||||
traits_.gemm_padding.value_or(builder::GemmPadding::DEFAULT));
|
||||
else
|
||||
algo.add("Struct does not contain optional gemm_padding argument");
|
||||
algo.add("Gemm padding: ", *traits_.gemm_padding);
|
||||
if(traits_.do_pad_gemm_m)
|
||||
algo.add("Do Padd Gemm M: ", traits_.do_pad_gemm_m.value_or(false));
|
||||
algo.add("Do Pad Gemm M: ", *traits_.do_pad_gemm_m);
|
||||
if(traits_.do_pad_gemm_n)
|
||||
algo.add("Do Padd Gemm N: ", traits_.do_pad_gemm_n.value_or(false));
|
||||
algo.add("Do Pad Gemm N: ", *traits_.do_pad_gemm_n);
|
||||
algo.add("Convolution specialization: ", traits_.conv_specialization);
|
||||
// Pipeline section
|
||||
algo.add("Pipeline version: ", traits_.pipeline_version);
|
||||
algo.add("Pipeline scheduler: ", traits_.pipeline_scheduler);
|
||||
auto& warpGemm = algo.add("Warp Gemm parameters: ");
|
||||
auto& warpGemm = algo.add("Warp Gemm parameters:");
|
||||
warpGemm.add("subtile size: ", traits_.warp_gemm.gemm_m, "×", traits_.warp_gemm.gemm_n);
|
||||
warpGemm.add("Number of warp gemm iterations: ",
|
||||
traits_.warp_gemm.m_iter,
|
||||
@@ -107,16 +104,21 @@ class ConvDescription : public Description
|
||||
// Memory Access section
|
||||
auto& memAccess = algo.add("Memory access:");
|
||||
|
||||
auto& aTile = memAccess.add("A Tile transfer: ");
|
||||
auto& aTile = memAccess.add("A Tile transfer:");
|
||||
aTile.add("Tile dimensions: ",
|
||||
traits_.a_tile_transfer.tile_dimensions.k0,
|
||||
"×",
|
||||
traits_.a_tile_transfer.tile_dimensions.m_or_n,
|
||||
"×",
|
||||
traits_.a_tile_transfer.tile_dimensions.k1,
|
||||
"×");
|
||||
traits_.a_tile_transfer.tile_dimensions.k1);
|
||||
aTile.add("The innermost K subdimension size: ",
|
||||
traits_.a_tile_transfer.transfer_params.k1);
|
||||
aTile.add("Thread cluster lengths (threads per axis): ",
|
||||
traits_.a_tile_transfer.transfer_params.thread_cluster_dims[0],
|
||||
"×",
|
||||
traits_.a_tile_transfer.transfer_params.thread_cluster_dims[1],
|
||||
"×",
|
||||
traits_.a_tile_transfer.transfer_params.thread_cluster_dims[2]);
|
||||
aTile.add("Spatial thread distribution over the data tile: ",
|
||||
traits_.a_tile_transfer.transfer_params.thread_cluster_order[0],
|
||||
"×",
|
||||
@@ -136,18 +138,23 @@ class ConvDescription : public Description
|
||||
aTile.add("Vector access (LDS write) instruction size: ",
|
||||
traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
|
||||
aTile.add("LDS data layout padding (to prevent bank conflicts): ",
|
||||
traits_.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
|
||||
traits_.a_tile_transfer.transfer_params.lds_padding);
|
||||
|
||||
auto& bTile = memAccess.add("B Tile transfer: ");
|
||||
auto& bTile = memAccess.add("B Tile transfer:");
|
||||
bTile.add("Tile dimensions: ",
|
||||
traits_.b_tile_transfer.tile_dimensions.k0,
|
||||
"×",
|
||||
traits_.b_tile_transfer.tile_dimensions.m_or_n,
|
||||
"×",
|
||||
traits_.b_tile_transfer.tile_dimensions.k1,
|
||||
"×");
|
||||
traits_.b_tile_transfer.tile_dimensions.k1);
|
||||
bTile.add("The innermost K subdimension size: ",
|
||||
traits_.b_tile_transfer.transfer_params.k1);
|
||||
bTile.add("Thread cluster lengths (threads per axis): ",
|
||||
traits_.b_tile_transfer.transfer_params.thread_cluster_dims[0],
|
||||
"×",
|
||||
traits_.b_tile_transfer.transfer_params.thread_cluster_dims[1],
|
||||
"×",
|
||||
traits_.b_tile_transfer.transfer_params.thread_cluster_dims[2]);
|
||||
bTile.add("Spatial thread distribution over the data tile: ",
|
||||
traits_.b_tile_transfer.transfer_params.thread_cluster_order[0],
|
||||
"×",
|
||||
@@ -167,9 +174,9 @@ class ConvDescription : public Description
|
||||
bTile.add("Vector access (LDS write) instruction size: ",
|
||||
traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
|
||||
bTile.add("LDS data layout padding (to prevent bank conflicts): ",
|
||||
traits_.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
|
||||
traits_.b_tile_transfer.transfer_params.lds_padding);
|
||||
|
||||
auto& cTile = memAccess.add("C Tile transfer: ");
|
||||
auto& cTile = memAccess.add("C Tile transfer:");
|
||||
cTile.add("Data shuffle (number of gemm instructions per iteration): ",
|
||||
traits_.c_tile_transfer.shuffle_params.m_gemms_per_shuffle,
|
||||
"×",
|
||||
@@ -185,27 +192,15 @@ class ConvDescription : public Description
|
||||
cTile.add("Vector access (GMEM write) instruction size: ",
|
||||
traits_.c_tile_transfer.scalar_per_vector);
|
||||
if(traits_.num_gemm_k_prefetch_stage)
|
||||
algo.add("Num gemm k prefetch stage: ", traits_.num_gemm_k_prefetch_stage.value_or(0));
|
||||
else
|
||||
algo.add("Struct does not contain optional "
|
||||
"num_gemm_k_prefetch_stage parameter");
|
||||
|
||||
algo.add("Num gemm k prefetch stage: ", *traits_.num_gemm_k_prefetch_stage);
|
||||
if(traits_.max_transpose_transfer_src_scalar_per_vector)
|
||||
algo.add("Max Transpose transfer scr scalar per vector: ",
|
||||
traits_.max_transpose_transfer_src_scalar_per_vector.value_or(0));
|
||||
else
|
||||
algo.add("Struct does not contain optional "
|
||||
"max_transpose_transfer_src_scalar_per_vector parameter");
|
||||
algo.add("Max Transpose transfer src scalar per vector: ",
|
||||
*traits_.max_transpose_transfer_src_scalar_per_vector);
|
||||
if(traits_.max_transpose_transfer_dst_scalar_per_vector)
|
||||
algo.add("Max Transpose dst scalar per vector: ",
|
||||
traits_.max_transpose_transfer_dst_scalar_per_vector.value_or(0));
|
||||
else
|
||||
algo.add("Struct does not contain optional "
|
||||
"max_transpose_transfer_dst_scalar_per_vector parameter");
|
||||
*traits_.max_transpose_transfer_dst_scalar_per_vector);
|
||||
if(traits_.num_groups_to_merge)
|
||||
algo.add("Num groups to merge: ", traits_.num_groups_to_merge.value_or(0));
|
||||
else
|
||||
algo.add("Struct does not contain optional num_groups_to_merge parameter");
|
||||
algo.add("Num groups to merge: ", *traits_.num_groups_to_merge);
|
||||
|
||||
return root.getString();
|
||||
}
|
||||
|
||||
@@ -73,7 +73,7 @@ namespace ck_tile::reflect::conv {
|
||||
// There is a lot we still need to do:
|
||||
//
|
||||
// TODO: Generalize type support for all tensors and accumulator.
|
||||
// TODO: Describe all tensros.
|
||||
// TODO: Describe all tensors.
|
||||
// TODO: Include the full generalization of the signature from the input schema.
|
||||
// TODO: Include the full generalization of the algorithm from the input schema.
|
||||
struct ConvTraits
|
||||
|
||||
@@ -260,57 +260,52 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
|
||||
static constexpr const ConvSignature SIGNATURE;
|
||||
static constexpr const DefaultAlgorithm ALGORITHM;
|
||||
using Instance = ckb::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
|
||||
EXPECT_THAT(
|
||||
ckr::describe<Instance>().detailed(),
|
||||
ckt::StringEqWithDiff( //
|
||||
"2D Forward Convolution Kernel\n"
|
||||
"├─ Signature\n"
|
||||
"│ ├─ Tensor Type: FP16\n"
|
||||
"│ ├─ Input Layout: GNHWC\n"
|
||||
"│ ├─ Weight Layout: GKYXC\n"
|
||||
"│ ├─ Output Layout: GNHWK\n"
|
||||
"│ ├─ Input elementwise operation: PASS_THROUGH\n"
|
||||
"│ ├─ Weights elementwise operation: PASS_THROUGH\n"
|
||||
"│ └─ Output elementwise operation: PASS_THROUGH\n"
|
||||
"└─ Algorithm\n"
|
||||
" ├─ Thread block size: 256\n"
|
||||
" ├─ Data tile size: 256×256×32\n"
|
||||
" ├─ Gemm padding: DEFAULT\n"
|
||||
" ├─ Convolution specialization: DEFAULT\n"
|
||||
" ├─ Pipeline version: V4\n"
|
||||
" ├─ Pipeline scheduler: INTRAWAVE\n"
|
||||
" ├─ Warp Gemm parameters: \n"
|
||||
" │ ├─ subtile size: 16×16\n"
|
||||
" │ └─ Number of warp gemm iterations: 8×8\n"
|
||||
" ├─ Memory access:\n"
|
||||
" │ ├─ A Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 4×256×8×\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 0×1×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 0×1×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 2\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 2\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n"
|
||||
" │ ├─ B Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 4×256×8×\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 0×1×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 0×1×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 2\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 2\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 2\n"
|
||||
" │ └─ C Tile transfer: \n"
|
||||
" │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
|
||||
" │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
|
||||
" │ └─ Vector access (GMEM write) instruction size: 2\n"
|
||||
" ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n"
|
||||
" ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector "
|
||||
"parameter\n"
|
||||
" ├─ Struct does not contain optional max_transpose_transfer_dst_scalar_per_vector "
|
||||
"parameter\n"
|
||||
" └─ Struct does not contain optional num_groups_to_merge parameter"));
|
||||
EXPECT_THAT(ckr::describe<Instance>().detailed(),
|
||||
ckt::StringEqWithDiff( //
|
||||
"2D Forward Convolution Kernel\n"
|
||||
"├─ Signature\n"
|
||||
"│ ├─ Tensor Type: FP16\n"
|
||||
"│ ├─ Input Layout: GNHWC\n"
|
||||
"│ ├─ Weight Layout: GKYXC\n"
|
||||
"│ ├─ Output Layout: GNHWK\n"
|
||||
"│ ├─ Input elementwise operation: PASS_THROUGH\n"
|
||||
"│ ├─ Weights elementwise operation: PASS_THROUGH\n"
|
||||
"│ └─ Output elementwise operation: PASS_THROUGH\n"
|
||||
"└─ Algorithm\n"
|
||||
" ├─ Thread block size: 256\n"
|
||||
" ├─ Data tile size: 256×256×32\n"
|
||||
" ├─ Gemm padding: DEFAULT\n"
|
||||
" ├─ Convolution specialization: DEFAULT\n"
|
||||
" ├─ Pipeline version: V4\n"
|
||||
" ├─ Pipeline scheduler: INTRAWAVE\n"
|
||||
" ├─ Warp Gemm parameters:\n"
|
||||
" │ ├─ subtile size: 16×16\n"
|
||||
" │ └─ Number of warp gemm iterations: 8×8\n"
|
||||
" └─ Memory access:\n"
|
||||
" ├─ A Tile transfer:\n"
|
||||
" │ ├─ Tile dimensions: 4×256×8\n"
|
||||
" │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n"
|
||||
" │ ├─ Spatial thread distribution over the data tile: 0×1×2\n"
|
||||
" │ ├─ The order of accessing data tile axes: 0×1×2\n"
|
||||
" │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ ├─ Vector access (GMEM read) instruction size: 2\n"
|
||||
" │ ├─ Vector access (LDS write) instruction size: 2\n"
|
||||
" │ └─ LDS data layout padding (to prevent bank conflicts): 0\n"
|
||||
" ├─ B Tile transfer:\n"
|
||||
" │ ├─ Tile dimensions: 4×256×8\n"
|
||||
" │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ ├─ Thread cluster lengths (threads per axis): 1×128×2\n"
|
||||
" │ ├─ Spatial thread distribution over the data tile: 0×1×2\n"
|
||||
" │ ├─ The order of accessing data tile axes: 0×1×2\n"
|
||||
" │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ ├─ Vector access (GMEM read) instruction size: 2\n"
|
||||
" │ ├─ Vector access (LDS write) instruction size: 2\n"
|
||||
" │ └─ LDS data layout padding (to prevent bank conflicts): 0\n"
|
||||
" └─ C Tile transfer:\n"
|
||||
" ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
|
||||
" ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
|
||||
" └─ Vector access (GMEM write) instruction size: 2"));
|
||||
}
|
||||
|
||||
// Test printing of optional parameters num_groups_to_merge,
|
||||
@@ -384,38 +379,38 @@ TEST(ConvDescriptionTest, BwdWeightTwoStageWmmaV3DescriptionTest)
|
||||
"└─ Algorithm\n"
|
||||
" ├─ Thread block size: 256\n"
|
||||
" ├─ Data tile size: 128×128×16\n"
|
||||
" ├─ Struct does not contain optional gemm_padding argument\n"
|
||||
" ├─ Convolution specialization: DEFAULT\n"
|
||||
" ├─ Pipeline version: V1\n"
|
||||
" ├─ Pipeline scheduler: DEFAULT\n"
|
||||
" ├─ Warp Gemm parameters: \n"
|
||||
" ├─ Warp Gemm parameters:\n"
|
||||
" │ ├─ subtile size: 32×32\n"
|
||||
" │ └─ Number of warp gemm iterations: 4×4\n"
|
||||
" ├─ Memory access:\n"
|
||||
" │ ├─ A Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8×\n"
|
||||
" │ ├─ A Tile transfer:\n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n"
|
||||
" │ ├─ B Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8×\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n"
|
||||
" │ ├─ B Tile transfer:\n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n"
|
||||
" │ └─ C Tile transfer: \n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n"
|
||||
" │ └─ C Tile transfer:\n"
|
||||
" │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
|
||||
" │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
|
||||
" │ └─ Vector access (GMEM write) instruction size: 8\n"
|
||||
" ├─ Struct does not contain optional num_gemm_k_prefetch_stage parameter\n"
|
||||
" ├─ Max Transpose transfer scr scalar per vector: 1\n"
|
||||
" ├─ Max Transpose transfer src scalar per vector: 1\n"
|
||||
" ├─ Max Transpose dst scalar per vector: 1\n"
|
||||
" └─ Num groups to merge: 4"));
|
||||
}
|
||||
@@ -473,57 +468,52 @@ TEST(ConvDescriptionTest, BwdWeightWmmaCshuffleV3DescriptionTest)
|
||||
ck::PipelineVersion::v1, // BlkGemmPipelineVer
|
||||
false>; // BComputeDataType
|
||||
|
||||
EXPECT_THAT(
|
||||
ckr::describe<Instance>().detailed(),
|
||||
ckt::StringEqWithDiff( //
|
||||
"3D Backward Weight Convolution Kernel\n"
|
||||
"├─ Signature\n"
|
||||
"│ ├─ Tensor Type: FP16\n"
|
||||
"│ ├─ Input Layout: GNDHWC\n"
|
||||
"│ ├─ Weight Layout: GKZYXC\n"
|
||||
"│ ├─ Output Layout: GNDHWK\n"
|
||||
"│ ├─ Input elementwise operation: PASS_THROUGH\n"
|
||||
"│ ├─ Weights elementwise operation: PASS_THROUGH\n"
|
||||
"│ └─ Output elementwise operation: PASS_THROUGH\n"
|
||||
"└─ Algorithm\n"
|
||||
" ├─ Thread block size: 256\n"
|
||||
" ├─ Data tile size: 128×128×16\n"
|
||||
" ├─ Struct does not contain optional gemm_padding argument\n"
|
||||
" ├─ Convolution specialization: DEFAULT\n"
|
||||
" ├─ Pipeline version: V1\n"
|
||||
" ├─ Pipeline scheduler: DEFAULT\n"
|
||||
" ├─ Warp Gemm parameters: \n"
|
||||
" │ ├─ subtile size: 32×32\n"
|
||||
" │ └─ Number of warp gemm iterations: 4×4\n"
|
||||
" ├─ Memory access:\n"
|
||||
" │ ├─ A Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8×\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n"
|
||||
" │ ├─ B Tile transfer: \n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8×\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 8\n"
|
||||
" │ └─ C Tile transfer: \n"
|
||||
" │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
|
||||
" │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
|
||||
" │ └─ Vector access (GMEM write) instruction size: 8\n"
|
||||
" ├─ Num gemm k prefetch stage: 1\n"
|
||||
" ├─ Struct does not contain optional max_transpose_transfer_src_scalar_per_vector "
|
||||
"parameter\n"
|
||||
" ├─ Struct does not contain optional max_transpose_transfer_dst_scalar_per_vector "
|
||||
"parameter\n"
|
||||
" └─ Struct does not contain optional num_groups_to_merge parameter"));
|
||||
EXPECT_THAT(ckr::describe<Instance>().detailed(),
|
||||
ckt::StringEqWithDiff( //
|
||||
"3D Backward Weight Convolution Kernel\n"
|
||||
"├─ Signature\n"
|
||||
"│ ├─ Tensor Type: FP16\n"
|
||||
"│ ├─ Input Layout: GNDHWC\n"
|
||||
"│ ├─ Weight Layout: GKZYXC\n"
|
||||
"│ ├─ Output Layout: GNDHWK\n"
|
||||
"│ ├─ Input elementwise operation: PASS_THROUGH\n"
|
||||
"│ ├─ Weights elementwise operation: PASS_THROUGH\n"
|
||||
"│ └─ Output elementwise operation: PASS_THROUGH\n"
|
||||
"└─ Algorithm\n"
|
||||
" ├─ Thread block size: 256\n"
|
||||
" ├─ Data tile size: 128×128×16\n"
|
||||
" ├─ Convolution specialization: DEFAULT\n"
|
||||
" ├─ Pipeline version: V1\n"
|
||||
" ├─ Pipeline scheduler: DEFAULT\n"
|
||||
" ├─ Warp Gemm parameters:\n"
|
||||
" │ ├─ subtile size: 32×32\n"
|
||||
" │ └─ Number of warp gemm iterations: 4×4\n"
|
||||
" ├─ Memory access:\n"
|
||||
" │ ├─ A Tile transfer:\n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n"
|
||||
" │ ├─ B Tile transfer:\n"
|
||||
" │ │ ├─ Tile dimensions: 2×128×8\n"
|
||||
" │ │ ├─ The innermost K subdimension size: 8\n"
|
||||
" │ │ ├─ Thread cluster lengths (threads per axis): 4×64×1\n"
|
||||
" │ │ ├─ Spatial thread distribution over the data tile: 1×0×2\n"
|
||||
" │ │ ├─ The order of accessing data tile axes: 1×0×2\n"
|
||||
" │ │ ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
|
||||
" │ │ ├─ Vector access (GMEM read) instruction size: 8\n"
|
||||
" │ │ ├─ Vector access (LDS write) instruction size: 8\n"
|
||||
" │ │ └─ LDS data layout padding (to prevent bank conflicts): 1\n"
|
||||
" │ └─ C Tile transfer:\n"
|
||||
" │ ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
|
||||
" │ ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
|
||||
" │ └─ Vector access (GMEM write) instruction size: 8\n"
|
||||
" └─ Num gemm k prefetch stage: 1"));
|
||||
}
|
||||
|
||||
TEST(ConvDescriptionTest, DefaultInstanceHasInstanceString)
|
||||
|
||||
Reference in New Issue
Block a user