mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
[CK-Tile] Enable vectorized reads on all layouts & improve perf. (#1835)
* Refactor universal gemm policy. * Adapt example to refactor changes. * Introduce static encoding pattern * Adding shuffled encoding patterns. * Fix err in reverse tuple. * Add transpose_tile2d * Small refactoring + doc * Enable reading on contiguous dimension in all layouts. * Transpose A/B register tile if needed for comp v3 pipeline. * Take contiguous dim size when calculating dram vector load size. * A/B smem pack size taken from WarpGemm attributes * Update B LDS layout and setup tile distribution pattern at class level. * Fix static assert. * Fix errors in examples. * Formatting & fix IsTranspose * Fix VectorSize & refactor. * Add error loging messages. * Fix VecLoadSize and TranspseC for mem pipeline. * Update unit-tests & disable mem pipeline. * Clang format * Update include/ck_tile/core/tensor/tile_window.hpp Co-authored-by: jakpiase <jakub.piasecki@amd.com> * Fix compilation and reviewers comments. * Refactor unit-test. Fallback to non-universal gemm. Need to use GemmPipelineAGmemBGmemCRegV1 for now, since GemmKernel is now supporting also non-K major vector reads. --------- Co-authored-by: jakpiase <jakub.piasecki@amd.com>
This commit is contained in:
@@ -88,12 +88,9 @@ using CodegenPipelineProblem =
|
||||
CodegenGemmShape,
|
||||
CodegenGemmTraits<ALayout, BLayout, CLayout>>;
|
||||
|
||||
using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
|
||||
|
||||
template <typename ALayout, typename BLayout, typename CLayout>
|
||||
using CodegenGemmPipeline =
|
||||
ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>,
|
||||
CodegenGemmPolicy>;
|
||||
ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;
|
||||
|
||||
template <typename ALayout, typename BLayout, typename CLayout>
|
||||
using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
|
||||
|
||||
@@ -41,7 +41,7 @@ auto create_args(int argc, char* argv[])
|
||||
.insert("stride_Bs", "", "Tensor B strides - it is empty by default.")
|
||||
.insert("stride_Cs", "", "Tensor C strides - it is empty by default.")
|
||||
.insert("a_layout", "R", "A tensor data layout - Row by default.")
|
||||
.insert("b_layout", "R", "B tensor data layout - Row by default.")
|
||||
.insert("b_layout", "C", "B tensor data layout - Row by default.")
|
||||
.insert("c_layout", "R", "C tensor data layout - Row by default.")
|
||||
.insert("validate", "1", "0. No validation, 1. Validation on CPU.")
|
||||
.insert("warmup", "10", "number of iterations before benchmark the kernel.")
|
||||
|
||||
@@ -135,12 +135,9 @@ int run_grouped_gemm_example_with_layouts(int argc,
|
||||
const ck_tile::index_t N = Ns[i];
|
||||
const ck_tile::index_t K = Ks[i];
|
||||
|
||||
stride_As[i] =
|
||||
ck_tile::get_default_stride(M, N, stride_As[i], is_row_major(a_layout));
|
||||
stride_Bs[i] =
|
||||
ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
|
||||
stride_Cs[i] =
|
||||
ck_tile::get_default_stride(M, N, stride_Cs[i], is_row_major(CLayout{}));
|
||||
stride_As[i] = ck_tile::get_default_stride(M, N, stride_As[i], is_row_major(a_layout));
|
||||
stride_Bs[i] = ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
|
||||
stride_Cs[i] = ck_tile::get_default_stride(M, N, stride_Cs[i], is_row_major(CLayout{}));
|
||||
|
||||
a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
|
||||
ck_tile::host_tensor_descriptor(M, K, stride_As[i], is_row_major(a_layout))));
|
||||
@@ -229,10 +226,10 @@ int run_grouped_gemm_example(int argc, char* argv[])
|
||||
{
|
||||
return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
|
||||
}
|
||||
else if(a_layout == "R" && b_layout == "R")
|
||||
{
|
||||
return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
|
||||
}
|
||||
// else if(a_layout == "R" && b_layout == "R")
|
||||
// {
|
||||
// return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
|
||||
// }
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
|
||||
|
||||
Reference in New Issue
Block a user