mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Generate output using Doxygen / Breathe (#598)
* Modify Doxygen config to pick up include directories recursively
* Add DeviceMem struct to API Reference guide
* Add classes that are used in Flash Attention kernel
* Add a reference and config for generating bibliography
Co-authored-by: Philip Maybank <Philip.Maybank@amd.com>
[ROCm/composable_kernel commit: e4bf6d422e]
This commit is contained in:
@@ -775,8 +775,10 @@ WARN_LOGFILE =
|
||||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||
# Note: If this tag is empty the current directory is searched.
|
||||
|
||||
INPUT = ../library/include \
|
||||
../library/include/internal
|
||||
INPUT = ../include/ck/tensor_operation/gpu/grid \
|
||||
../include/ck/tensor_operation/gpu/block \
|
||||
../include/ck/tensor_operation/gpu/thread \
|
||||
../library/include/ck/library/utility
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||
@@ -845,7 +847,7 @@ FILE_PATTERNS = *.c \
|
||||
# be searched for input files as well.
|
||||
# The default value is: NO.
|
||||
|
||||
RECURSIVE = NO
|
||||
RECURSIVE = YES
|
||||
|
||||
# The EXCLUDE tag can be used to specify files and/or directories that should be
|
||||
# excluded from the INPUT source files. This way you can easily exclude a
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
|
||||
===================
|
||||
*******************
|
||||
API Reference Guide
|
||||
===================
|
||||
*******************
|
||||
|
||||
------------
|
||||
=================
|
||||
Introduction
|
||||
------------
|
||||
=================
|
||||
|
||||
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
|
||||
principles that are used to write new classes that extend CK functionality.
|
||||
@@ -16,8 +16,37 @@ Using CK API
|
||||
|
||||
This section describes how to use the CK library API.
|
||||
|
||||
-----------------
|
||||
=================
|
||||
CK Datatypes
|
||||
=================
|
||||
|
||||
-----------------
|
||||
DeviceMem
|
||||
-----------------
|
||||
|
||||
[TODO]
|
||||
.. doxygenstruct:: DeviceMem
|
||||
|
||||
---------------------------
|
||||
Kernels For Flashattention
|
||||
---------------------------
|
||||
|
||||
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists the classes that are
|
||||
used in the CK GPU implementation of Flashattention.
|
||||
|
||||
**Gridwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
|
||||
**Blockwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
|
||||
|
||||
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
|
||||
|
||||
.. doxygenstruct:: ck::BlockwiseSoftmax
|
||||
|
||||
**Threadwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
|
||||
|
||||
.. bibliography::
|
||||
@@ -59,10 +59,13 @@ if read_the_docs_build:
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = ['sphinx.ext.mathjax', 'breathe']
|
||||
extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex']
|
||||
|
||||
breathe_projects = { "CK": "../docBin/xml" }
|
||||
breathe_default_project = "CK"
|
||||
|
||||
bibtex_bibfiles = ['refs.bib']
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
|
||||
7
docs/source/refs.bib
Normal file
7
docs/source/refs.bib
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
@article{dao2022flashattention,
|
||||
title={Flashattention: Fast and memory-efficient exact attention with io-awareness},
|
||||
author={Dao, Tri and Fu, Daniel Y and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
|
||||
journal={arXiv preprint arXiv:2205.14135},
|
||||
year={2022}
|
||||
}
|
||||
@@ -622,11 +622,16 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
|
||||
}
|
||||
};
|
||||
|
||||
// Blockwise gemm supporting
|
||||
// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
|
||||
// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
|
||||
// source buffer
|
||||
// 3. configurable k index starting position and step size after each FMA/XDL instruction
|
||||
/**
|
||||
* @brief Blockwise gemm
|
||||
*
|
||||
* Supports
|
||||
* 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
|
||||
* 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
|
||||
* source buffer
|
||||
* 3. configurable k index starting position and step size after each FMA/XDL instruction
|
||||
*/
|
||||
|
||||
template <index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
|
||||
@@ -12,6 +12,16 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
/**
|
||||
* @brief Blockwise softmax
|
||||
*
|
||||
* @tparam BlockSize
|
||||
* @tparam AccDataType
|
||||
* @tparam ThreadMap_M_K
|
||||
* @tparam ThreadClusterDesc_M_K
|
||||
* @tparam ThreadSliceDesc_M_K
|
||||
* @tparam IgnoreNaN
|
||||
*/
|
||||
template <index_t BlockSize,
|
||||
typename AccDataType,
|
||||
typename ThreadMap_M_K, // thread_id to m_k
|
||||
|
||||
@@ -11,10 +11,15 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
// this version does following things to avoid scratch memory issue
|
||||
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
|
||||
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
|
||||
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
|
||||
/**
|
||||
* @brief Blockwise data transfer
|
||||
*
|
||||
* This version does following things to avoid scratch memory issue
|
||||
* 1. Use StaticallyIndexedArray instead of C array for thread buffer
|
||||
* 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
|
||||
* 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
|
||||
*
|
||||
*/
|
||||
template <typename ThreadGroup,
|
||||
typename SrcElementwiseOperation,
|
||||
typename DstElementwiseOperation,
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
/**
|
||||
* @brief Gridwise gemm + softmax + gemm fusion
|
||||
*
|
||||
*/
|
||||
template <typename FloatAB,
|
||||
typename FloatGemmAcc,
|
||||
typename FloatCShuffle,
|
||||
|
||||
@@ -1201,7 +1201,12 @@ struct ThreadwiseTensorSliceTransfer_v4
|
||||
SrcCoord src_ref_coord_;
|
||||
};
|
||||
|
||||
// Do NOT involve any tensor coordinates with StaticBuffer
|
||||
/**
|
||||
* @brief Threadwise data transfer
|
||||
*
|
||||
* Do NOT involve any tensor coordinates with StaticBuffer
|
||||
*
|
||||
*/
|
||||
template <typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
|
||||
@@ -14,6 +14,10 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Container for storing data in GPU device memory
|
||||
*
|
||||
*/
|
||||
struct DeviceMem
|
||||
{
|
||||
DeviceMem() = delete;
|
||||
|
||||
Reference in New Issue
Block a user