add another Xdl policy and improve indexing - 1

This commit is contained in:
Philip Maybank
2025-11-17 11:48:06 +00:00
parent 3c270494dd
commit 27eb3b347d
3 changed files with 83 additions and 22 deletions

View File

@@ -17,9 +17,20 @@ GEMM Pipelines defined in ``include/ck/tensor_operation/gpu/block`` directory.
gemm(i)
lds_write(i+1)
:ref:`BlockwiseGemmXdlops_pipeline_v2 <xdl-v2>`
.. code-block::
buffer_load(0)
lds_write(0)
buffer_load(1:prefetch)
lds_read(i)
gemm(i)
lds_write(i+1)
buffer_load(i+prefetch)
**TODO**
- BlockwiseGemmXdlops_pipeline_v2
- BlockwiseGemmXdlops_pipeline_v3
CK_TILE GEMM pipelines
@@ -41,4 +52,23 @@ Pipelines defined in ``include/ck_tile/ops/gemm/pipeline`` directory.
**TODO**
- GemmPipelineAgBgCrMem
- GemmPipelineAgBgCrCompV3
- GemmPipelineAgBgCrCompV3
Documentation - Xdl Pipelines
-------------------------------
.. toctree::
:maxdepth: 2
:caption: Xdl Pipelines
xdl-v1
xdl-v2
Documentation - CK_TILE Pipelines
---------------------------------
.. toctree::
:maxdepth: 2
:caption: CK_TILE Pipelines
ck_tile-v1

View File

@@ -7,29 +7,58 @@ Xdl - v1 - Intrawave
.. code-block::
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
// Global prefetch 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// Local prefill 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
// Initialize C
c_thread_buf.Clear();
// main body
if constexpr(HasMainLoop)
{
index_t i = 0;
do
{
// -------------------------------------------------------------------------------------------
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
block_sync_lds();
static_for<0, KRepeat, 1>{}([&](auto k) {
static_for<0, MRepeat, 1>{}([&](auto m0) {
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
a_block_buf,
a_thread_desc_,
make_tuple(m0, I0, k, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
b_block_buf,
b_thread_desc_,
make_tuple(n0, I0, k, I0),
b_thread_buf);
});
});
});
});
});
The ``a_thread_copy_.Run`` function is a member of the ``ThreadwiseTensorSliceTransfer_v4`` class. The ``ThreadwiseTensorSliceTransfer_v4`` class uses:
..
The ``a_thread_copy_.Run`` function is a member of the ``ThreadwiseTensorSliceTransfer_v4`` class. The ``ThreadwiseTensorSliceTransfer_v4`` class uses:
- ``make_tensor_coordinate`` from ``tensor_descriptor.hpp``
- ``make_naive_tensor_descriptor_packed`` from ``tensor_descriptor_helper.hpp``
- ``make_tensor_coordinate`` from ``tensor_descriptor.hpp``
- ``make_naive_tensor_descriptor_packed`` from ``tensor_descriptor_helper.hpp``
**full class definition**

View File

@@ -34,6 +34,8 @@ subtrees:
title: Composable Kernel vector utilities
- file: reference/Composable-Kernel-wrapper.rst
title: Composable Kernel wrapper
- file: reference/pipelines/index.rst
title: Pipeline Policies
- file: doxygen/html/namespace_c_k.rst
title: CK API reference
- file: doxygen/html/namespaceck__tile.rst