mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 19:57:40 +00:00
add another Xdl policy and improve indexing - 1
This commit is contained in:
@@ -17,9 +17,20 @@ GEMM Pipelines defined in ``include/ck/tensor_operation/gpu/block`` directory.
|
||||
gemm(i)
|
||||
lds_write(i+1)
|
||||
|
||||
:ref:`BlockwiseGemmXdlops_pipeline_v2 <xdl-v2>`
|
||||
|
||||
.. code-block::
|
||||
|
||||
buffer_load(0)
|
||||
lds_write(0)
|
||||
buffer_load(1:prefetch)
|
||||
lds_read(i)
|
||||
gemm(i)
|
||||
lds_write(i+1)
|
||||
buffer_load(i+prefetch)
|
||||
|
||||
**TODO**
|
||||
|
||||
- BlockwiseGemmXdlops_pipeline_v2
|
||||
- BlockwiseGemmXdlops_pipeline_v3
|
||||
|
||||
CK_TILE GEMM pipelines
|
||||
@@ -41,4 +52,23 @@ Pipelines defined in ``include/ck_tile/ops/gemm/pipeline`` directory.
|
||||
**TODO**
|
||||
|
||||
- GemmPipelineAgBgCrMem
|
||||
- GemmPipelineAgBgCrCompV3
|
||||
- GemmPipelineAgBgCrCompV3
|
||||
|
||||
Documentation - Xdl Pipelines
|
||||
-------------------------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Xdl Pipelines
|
||||
|
||||
xdl-v1
|
||||
xdl-v2
|
||||
|
||||
Documentation - CK_TILE Pipelines
|
||||
---------------------------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: CK_TILE Pipelines
|
||||
|
||||
ck_tile-v1
|
||||
@@ -7,29 +7,58 @@ Xdl - v1 - Intrawave
|
||||
|
||||
.. code-block::
|
||||
|
||||
static_for<0, KRepeat, 1>{}([&](auto k) {
|
||||
static_for<0, MRepeat, 1>{}([&](auto m0) {
|
||||
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
|
||||
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
|
||||
a_block_buf,
|
||||
a_thread_desc_,
|
||||
make_tuple(m0, I0, k, I0),
|
||||
a_thread_buf);
|
||||
static_for<0, NRepeat, 1>{}([&](auto n0) {
|
||||
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
|
||||
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
|
||||
b_block_buf,
|
||||
b_thread_desc_,
|
||||
make_tuple(n0, I0, k, I0),
|
||||
b_thread_buf);
|
||||
// Global prefetch 1
|
||||
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
|
||||
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
|
||||
|
||||
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
|
||||
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
|
||||
|
||||
// Local prefill 1
|
||||
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
|
||||
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
|
||||
|
||||
// Initialize C
|
||||
c_thread_buf.Clear();
|
||||
|
||||
// main body
|
||||
if constexpr(HasMainLoop)
|
||||
{
|
||||
index_t i = 0;
|
||||
do
|
||||
{
|
||||
// -------------------------------------------------------------------------------------------
|
||||
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
|
||||
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
|
||||
|
||||
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
|
||||
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
|
||||
|
||||
block_sync_lds();
|
||||
static_for<0, KRepeat, 1>{}([&](auto k) {
|
||||
static_for<0, MRepeat, 1>{}([&](auto m0) {
|
||||
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
|
||||
make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
|
||||
a_block_buf,
|
||||
a_thread_desc_,
|
||||
make_tuple(m0, I0, k, I0),
|
||||
a_thread_buf);
|
||||
static_for<0, NRepeat, 1>{}([&](auto n0) {
|
||||
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
|
||||
make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
|
||||
b_block_buf,
|
||||
b_thread_desc_,
|
||||
make_tuple(n0, I0, k, I0),
|
||||
b_thread_buf);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
The ``a_thread_copy_.Run`` function is a member of the ``ThreadwiseTensorSliceTransfer_v4`` class, which uses the following helpers:
|
||||
..
|
||||
The ``a_thread_copy_.Run`` function is a member of the ``ThreadwiseTensorSliceTransfer_v4`` class, which uses the following helpers:
|
||||
|
||||
- ``make_tensor_coordinate`` from ``tensor_descriptor.hpp``
|
||||
- ``make_naive_tensor_descriptor_packed`` from ``tensor_descriptor_helper.hpp``
|
||||
- ``make_tensor_coordinate`` from ``tensor_descriptor.hpp``
|
||||
- ``make_naive_tensor_descriptor_packed`` from ``tensor_descriptor_helper.hpp``
|
||||
|
||||
**full class definition**
|
||||
|
||||
|
||||
@@ -34,6 +34,8 @@ subtrees:
|
||||
title: Composable Kernel vector utilities
|
||||
- file: reference/Composable-Kernel-wrapper.rst
|
||||
title: Composable Kernel wrapper
|
||||
- file: reference/pipelines/index.rst
|
||||
title: Pipeline Policies
|
||||
- file: doxygen/html/namespace_c_k.rst
|
||||
title: CK API reference
|
||||
- file: doxygen/html/namespaceck__tile.rst
|
||||
|
||||
Reference in New Issue
Block a user