Merge commit 'cfae8634313f804593d123b3ec51a43319f5fab1' into develop

This commit is contained in:
github-actions[bot]
2025-04-30 22:39:56 +00:00
parent 1eb2e57380
commit f675bb9252
29 changed files with 62 additions and 138 deletions

View File

@@ -42,19 +42,19 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "ck"
PROJECT_NAME = "Composable Kernel"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = v3.0.1.0
PROJECT_NUMBER =
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP"
PROJECT_BRIEF = "Prototype interfaces compatible with ROCm platform and HiP"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -949,8 +949,8 @@ INPUT = ../../include/ck/tensor_operation/gpu/grid \
../../include/ck/tensor_operation/gpu/block \
../../include/ck/tensor_operation/gpu/thread \
../../library/include/ck/library/utility \
../../include/ck/wrapper
../../include/ck/wrapper \
../../include/ck_tile
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1161,7 +1161,8 @@ FILTER_SOURCE_PATTERNS =
# (index.html). This can be useful if you have a project on for instance GitHub
# and want to reuse the introduction page also for the doxygen output.
USE_MDFILE_AS_MAINPAGE = ../../README.md
USE_MDFILE_AS_MAINPAGE =
# The Fortran standard specifies that for fixed formatted Fortran code all
# characters from position 72 are to be considered as comment. A common
@@ -1370,7 +1371,7 @@ HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css
# files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_EXTRA_FILES =
HTML_EXTRA_FILES = ../_doxygen/extra_stylesheet.css
# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output
# should be rendered with a dark or light theme.

View File

@@ -35,9 +35,9 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab
* :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>`
* :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>`
* :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>`
* :ref:`api-reference`
* :ref:`wrapper`
* :ref:`wrapper`
* :doc:`Composable Kernel complete class list <./doxygen/html/annotated>`
To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.

View File

@@ -1,42 +0,0 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _api-reference:
********************************************************************
Composable Kernel API reference guide
********************************************************************
This document contains details of the APIs for the Composable Kernel library and introduces some of the key design principles that are used to write new classes that extend the functionality of the Composable Kernel library.
=================
DeviceMem
=================
.. doxygenstruct:: DeviceMem
=============================
Kernels For Flashattention
=============================
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes**
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
**Blockwise classes**
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
.. doxygenstruct:: ck::BlockwiseSoftmax
**Threadwise classes**
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
.. bibliography::

View File

@@ -32,10 +32,10 @@ subtrees:
title: Composable Kernel custom types
- file: reference/Composable_Kernel_vector_utilities.rst
title: Composable Kernel vector utilities
- file: reference/Composable-Kernel-API-reference.rst
title: Composable Kernel API reference
- file: reference/Composable-Kernel-wrapper.rst
title: Composable Kernel Wrapper
title: Composable Kernel wrapper
- file: doxygen/html/annotated.rst
title: Composable Kernel class list
- caption: About
entries:

View File

@@ -5,6 +5,7 @@ include_directories(BEFORE
add_custom_target(examples)
# list of examples that are labelled as REGRESSION_EXAMPLE for make regression (runtime more than 30 seconds)
# all other tests are labelled as SMOKE_EXAMPLE
set(REGRESSION_EXAMPLES
@@ -231,9 +232,6 @@ endfunction(add_example_executable_no_testing EXAMPLE_NAME)
# add all example subdir
file(GLOB dir_list LIST_DIRECTORIES true *)
if (NOT SUPPORTED_GPU_TARGETS MATCHES "gfx9")
list(FILTER dir_list EXCLUDE REGEX ".*/ck_tile")
endif()
FOREACH(subdir ${dir_list})
if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
add_subdirectory(${subdir})

View File

@@ -58,8 +58,7 @@ set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
message("adding example ${EXAMPLE_FMHA_FWD}")
add_executable(${EXAMPLE_FMHA_FWD} fmha_fwd.cpp)
rocm_install(TARGETS ${EXAMPLE_FMHA_FWD} COMPONENT examples)
add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
@@ -67,8 +66,7 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
message("adding example ${EXAMPLE_FMHA_BWD}")
add_executable(${EXAMPLE_FMHA_BWD} fmha_bwd.cpp)
rocm_install(TARGETS ${EXAMPLE_FMHA_BWD} COMPONENT examples)
add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp)
target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})

View File

@@ -26,8 +26,7 @@ add_custom_command(
set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
message("adding example ${EXAMPLE_LAYERNORM2D_FWD}")
add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp)
rocm_install(TARGETS ${EXAMPLE_LAYERNORM2D_FWD} COMPONENT examples)
add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})

View File

@@ -1,8 +1,5 @@
add_executable(tile_example_gemm_basic gemm_basic.cpp)
rocm_install(TARGETS tile_example_gemm_basic COMPONENT examples)
add_executable(tile_example_gemm_universal universal_gemm.cpp)
rocm_install(TARGETS tile_example_gemm_universal COMPONENT examples)
add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
if(CK_USE_OCP_FP8)
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)

View File

@@ -1 +0,0 @@
for file in gemm_universal_*; do mv "$file" "${file/f16_f16_f16/fp16_fp16_fp16}"; done

View File

@@ -1,4 +1,3 @@
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(tile_example_img2col image_to_column.cpp)
rocm_install(TARGETS tile_example_img2col COMPONENT examples)
add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp)

View File

@@ -3,9 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce")
# to be included in "make all/install/check"
message("adding example ${EXAMPLE_REDUCE}")
add_executable(${EXAMPLE_REDUCE} reduce.cpp)
rocm_install(TARGETS ${EXAMPLE_REDUCE} COMPONENT examples)
add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp)
target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
set(EXAMPLE_REDUCE_COMPILE_OPTIONS)

View File

@@ -1,7 +1,6 @@
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(tile_example_permute permute.cpp)
rocm_install(TARGETS tile_example_permute COMPONENT examples)
add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
# set(PERMUTE_USE_ALTERNATIVE_IMPL false)

View File

@@ -1,7 +1,6 @@
add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp)
rocm_install(TARGETS tile_example_topk_softmax COMPONENT examples)
add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp)
target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)

View File

@@ -26,8 +26,7 @@ add_custom_command(
set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")
message("adding ${TILE_RMSNORM2D_FWD}")
add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp)
rocm_install(TARGETS ${TILE_RMSNORM2D_FWD} COMPONENT examples)
add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS})
@@ -39,8 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno
target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp)
rocm_install(TARGETS ${EXAMPLE_RMSNORM2D_FWD} COMPONENT examples)
add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp)
target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
# TODO: we have to turn off this global prop, otherwise the progress bar generated

View File

@@ -3,8 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd")
# to be included in "make all/install/check"
message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
file(GLOB INSTANCE_SRCS instances/*.cpp)
add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp)
rocm_install(TARGETS ${TILE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples)
add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp)
target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS})
@@ -16,8 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t
target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd")
add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp)
rocm_install(TARGETS ${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples)
add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp)
target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
# TODO: we have to turn off this global prop, otherwise the progress bar generated

View File

@@ -67,14 +67,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
using TypeConfig = AddRmsnormRdquantTypeConfig<InputDataType, QuantizedDataType>;
using ADataType = typename TypeConfig::ADataType;
using BDataType = typename TypeConfig::BDataType;
using GammaDataType = typename TypeConfig::GammaDataType;
using XDataType = typename TypeConfig::XDataType;
using UnquantYDataType = ck_tile::null_type;
using YScaleDataType = typename TypeConfig::YScaleDataType;
using QYDataType = typename TypeConfig::QYDataType;
using ComputeDataType = float;
using ADataType = typename TypeConfig::ADataType;
using BDataType = typename TypeConfig::BDataType;
using GammaDataType = typename TypeConfig::GammaDataType;
using XDataType = typename TypeConfig::XDataType;
using YScaleDataType = typename TypeConfig::YScaleDataType;
using QYDataType = typename TypeConfig::QYDataType;
using ComputeDataType = float;
// host verify
ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
@@ -89,7 +88,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n}, {stride, 1});
ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_host);
@@ -193,9 +191,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
GammaDataType,
ComputeDataType,
YDataType,
InvRmsDataType,
UnquantYDataType>(
x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon);
InvRmsDataType>(
x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
}
// yscale

View File

@@ -62,14 +62,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
assert(stride >= n);
using ADataType = DataType;
using BDataType = DataType;
using GammaDataType = DataType;
using XDataType = DataType;
using UnquantYDataType = ck_tile::null_type;
using YScaleDataType = float;
using QYDataType = ck_tile::int8_t;
using ComputeDataType = float;
using ADataType = DataType;
using BDataType = DataType;
using GammaDataType = DataType;
using XDataType = DataType;
using YScaleDataType = float;
using QYDataType = ck_tile::int8_t;
using ComputeDataType = float;
// host verify
ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
@@ -82,7 +81,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {stride, 1});
ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {stride, 1});
ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n}, {stride, 1});
ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_host);
@@ -195,9 +193,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
GammaDataType,
ComputeDataType,
YDataType,
InvRmsDataType,
UnquantYDataType>(
x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon);
InvRmsDataType>(
x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
}
// yscale

View File

@@ -2,8 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC)
message("adding ${TARGET_NAME}")
# not using add_example_executable() to add target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(${TARGET_NAME} ${MAIN_SRC})
rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples)
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
foreach(source IN LISTS ARGN)

View File

@@ -1,5 +1,4 @@
add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp)
rocm_install(TARGETS tile_example_moe_sorting COMPONENT examples)
add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp)
target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)

View File

@@ -2,8 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
message("adding ${TARGET_NAME}")
# not using add_example_executable() to add target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(${TARGET_NAME} ${MAIN_SRC})
rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples)
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
foreach(source IN LISTS ARGN)

View File

@@ -3,8 +3,7 @@ set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe")
# to be included in "make all/install/check"
message("adding ${TILE_EXAPMLE_FUSED_MOE}")
file(GLOB INSTANCE_SRCS instances/*.cpp)
add_executable(${TILE_EXAPMLE_FUSED_MOE} main.cpp)
rocm_install(TARGETS ${TILE_EXAPMLE_FUSED_MOE} COMPONENT examples)
add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp)
target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})

View File

@@ -1,2 +1 @@
add_executable(tile_example_batched_gemm batched_gemm.cpp)
rocm_install(TARGETS tile_example_batched_gemm COMPONENT examples)
add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp)

View File

@@ -1,2 +1,2 @@
add_executable(tile_example_grouped_gemm grouped_gemm.cpp)
rocm_install(TARGETS tile_example_grouped_gemm COMPONENT examples)
add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)

View File

@@ -1,6 +1,4 @@
add_executable(tile_example_flatmm_basic flatmm_basic.cpp)
rocm_install(TARGETS tile_example_flatmm_basic COMPONENT examples)
add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)

View File

@@ -1,9 +1,9 @@
set(TARGET_NAME tile_example_batched_transpose)
add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp)
rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples)
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp)
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
# list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS})

View File

@@ -14,11 +14,8 @@ add_subdirectory(11_add_rmsnorm2d_rdquant)
add_subdirectory(12_smoothquant)
add_subdirectory(13_moe_sorting)
add_subdirectory(14_moe_smoothquant)
add_subdirectory(15_fused_moe)
add_subdirectory(16_batched_gemm)
add_subdirectory(17_grouped_gemm)
add_subdirectory(18_flatmm)
add_subdirectory(35_batched_transpose)
if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
add_subdirectory(15_fused_moe)
endif()

View File

@@ -6,7 +6,6 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host/concat.hpp"
#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
#include "ck_tile/host/concat.hpp"
namespace ck_tile {

View File

@@ -30,7 +30,8 @@ struct GemmPipelineProblemBase
using BLayout = remove_cvref_t<typename Traits::BLayout>;
using CLayout = remove_cvref_t<typename Traits::CLayout>;
static constexpr bool TransposeC = Traits::TransposeC;
static constexpr bool TransposeC = Traits::TransposeC;
static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity;
static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();

View File

@@ -12,8 +12,7 @@ template <bool kPadM_,
bool kPadK_,
typename ALayout_,
typename BLayout_,
typename CLayout_,
bool UseStructuredSparsity_ = false>
typename CLayout_>
struct TileGemmTraits
{
static constexpr bool kPadM = kPadM_;
@@ -28,7 +27,7 @@ struct TileGemmTraits
using CLayout = CLayout_;
static constexpr bool TransposeC = false;
static constexpr bool UseStructuredSparsity = UseStructuredSparsity_;
static constexpr bool UseStructuredSparsity = false;
};
template <bool kPadM_,