Generate output using Doxygen / Breathe (#598)

* Modify Doxygen config to pick up include directories recursively

* Add DeviceMem struct to API Reference guide

* Add classes that are used in Flash Attention kernel

* Add a reference and config for generating bibliography

Co-authored-by: Philip Maybank <Philip.Maybank@amd.com>

[ROCm/composable_kernel commit: e4bf6d422e]
This commit is contained in:
pmaybank
2023-03-06 17:39:16 +00:00
committed by GitHub
parent ef3be1d9cf
commit 9080b984cb
10 changed files with 94 additions and 20 deletions

View File

@@ -775,8 +775,10 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = ../library/include \
../library/include/internal
INPUT = ../include/ck/tensor_operation/gpu/grid \
../include/ck/tensor_operation/gpu/block \
../include/ck/tensor_operation/gpu/thread \
../library/include/ck/library/utility
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -845,7 +847,7 @@ FILE_PATTERNS = *.c \
# be searched for input files as well.
# The default value is: NO.
RECURSIVE = NO
RECURSIVE = YES
# The EXCLUDE tag can be used to specify files and/or directories that should be
# excluded from the INPUT source files. This way you can easily exclude a

View File

@@ -1,11 +1,11 @@
===================
*******************
API Reference Guide
===================
*******************
------------
=================
Introduction
------------
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
principles that are used to write new classes that extend CK functionality.
@@ -16,8 +16,37 @@ Using CK API
This section describes how to use the CK library API.
-----------------
=================
CK Datatypes
=================
-----------------
DeviceMem
-----------------
[TODO]
.. doxygenstruct:: DeviceMem
---------------------------
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists the classes that are
used in the CK GPU implementation of Flashattention.
**Gridwise classes**
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
**Blockwise classes**
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
.. doxygenstruct:: ck::BlockwiseSoftmax
**Threadwise classes**
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
.. bibliography::

View File

@@ -59,10 +59,13 @@ if read_the_docs_build:
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.mathjax', 'breathe']
extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex']
breathe_projects = { "CK": "../docBin/xml" }
breathe_default_project = "CK"
bibtex_bibfiles = ['refs.bib']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

7
docs/source/refs.bib Normal file
View File

@@ -0,0 +1,7 @@
@article{dao2022flashattention,
title={Flashattention: Fast and memory-efficient exact attention with io-awareness},
author={Dao, Tri and Fu, Daniel Y and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
journal={arXiv preprint arXiv:2205.14135},
year={2022}
}

View File

@@ -622,11 +622,16 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
}
};
// Blockwise gemm supporting
// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
// source buffer
// 3. configurable k index starting position and step size after each FMA/XDL instruction
/**
* @brief Blockwise gemm
*
* Supports
* 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
* 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS
* source buffer
* 3. configurable k index starting position and step size after each FMA/XDL instruction
*/
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,

View File

@@ -12,6 +12,16 @@
namespace ck {
/**
* @brief Blockwise softmax
*
* @tparam BlockSize
* @tparam AccDataType
* @tparam ThreadMap_M_K
* @tparam ThreadClusterDesc_M_K
* @tparam ThreadSliceDesc_M_K
* @tparam IgnoreNaN
*/
template <index_t BlockSize,
typename AccDataType,
typename ThreadMap_M_K, // thread_id to m_k

View File

@@ -11,10 +11,15 @@
namespace ck {
// this version does following things to avoid scratch memory issue
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
/**
* @brief Blockwise data transfer
*
* This version does following things to avoid scratch memory issue
* 1. Use StaticallyIndexedArray instead of C array for thread buffer
* 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
* 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
*
*/
template <typename ThreadGroup,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,

View File

@@ -18,6 +18,10 @@
namespace ck {
/**
* @brief Gridwise gemm + softmax + gemm fusion
*
*/
template <typename FloatAB,
typename FloatGemmAcc,
typename FloatCShuffle,

View File

@@ -1201,7 +1201,12 @@ struct ThreadwiseTensorSliceTransfer_v4
SrcCoord src_ref_coord_;
};
// Do NOT involve any tensor coordinates with StaticBuffer
/**
* @brief Threadwise data transfer
*
* Do NOT involve any tensor coordinates with StaticBuffer
*
*/
template <typename SrcData,
typename DstData,
typename SrcDesc,

View File

@@ -14,6 +14,10 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
}
}
/**
* @brief Container for storing data in GPU device memory
*
*/
struct DeviceMem
{
DeviceMem() = delete;