From f675bb9252d98fc3ddcf9b8aae6111e5e89dda30 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 30 Apr 2025 22:39:56 +0000 Subject: [PATCH] Merge commit 'cfae8634313f804593d123b3ec51a43319f5fab1' into develop --- docs/doxygen/Doxyfile | 15 +++---- docs/index.rst | 6 +-- .../Composable-Kernel-API-reference.rst | 42 ------------------- docs/sphinx/_toc.yml.in | 6 +-- example/CMakeLists.txt | 4 +- example/ck_tile/01_fmha/CMakeLists.txt | 6 +-- example/ck_tile/02_layernorm2d/CMakeLists.txt | 3 +- example/ck_tile/03_gemm/CMakeLists.txt | 7 +--- example/ck_tile/03_gemm/stript.sh | 1 - example/ck_tile/04_img2col/CMakeLists.txt | 3 +- example/ck_tile/05_reduce/CMakeLists.txt | 4 +- example/ck_tile/06_permute/CMakeLists.txt | 3 +- .../ck_tile/09_topk_softmax/CMakeLists.txt | 5 +-- example/ck_tile/10_rmsnorm2d/CMakeLists.txt | 6 +-- .../11_add_rmsnorm2d_rdquant/CMakeLists.txt | 6 +-- .../add_rmsnorm2d_rdquant_fwd.cpp | 21 ++++------ .../example_add_rmsnorm2d_rdquant_fwd.cpp | 21 ++++------ example/ck_tile/12_smoothquant/CMakeLists.txt | 3 +- example/ck_tile/13_moe_sorting/CMakeLists.txt | 3 +- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 3 +- example/ck_tile/15_fused_moe/CMakeLists.txt | 3 +- .../ck_tile/16_batched_gemm/CMakeLists.txt | 3 +- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 4 +- example/ck_tile/18_flatmm/CMakeLists.txt | 4 +- .../35_batched_transpose/CMakeLists.txt | 4 +- example/ck_tile/CMakeLists.txt | 5 +-- .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 1 - .../gemm/pipeline/gemm_pipeline_problem.hpp | 3 +- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 5 +-- 29 files changed, 62 insertions(+), 138 deletions(-) delete mode 100644 docs/reference/Composable-Kernel-API-reference.rst delete mode 100644 example/ck_tile/03_gemm/stript.sh mode change 100755 => 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index d6f38e0ca9..4367aabc95 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -42,19 +42,19 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "ck" +PROJECT_NAME = "Composable Kernel" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v3.0.1.0 +PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" +PROJECT_BRIEF = "Prototype interfaces compatible with ROCm platform and HiP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 @@ -949,8 +949,8 @@ INPUT = ../../include/ck/tensor_operation/gpu/grid \ ../../include/ck/tensor_operation/gpu/block \ ../../include/ck/tensor_operation/gpu/thread \ ../../library/include/ck/library/utility \ - ../../include/ck/wrapper - + ../../include/ck/wrapper \ + ../../include/ck_tile # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1161,7 +1161,8 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = ../../README.md + +USE_MDFILE_AS_MAINPAGE = # The Fortran standard specifies that for fixed formatted Fortran code all # characters from position 72 are to be considered as comment. A common @@ -1370,7 +1371,7 @@ HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = ../_doxygen/extra_stylesheet.css # The HTML_COLORSTYLE tag can be used to specify if the generated HTML output # should be rendered with a dark or light theme. diff --git a/docs/index.rst b/docs/index.rst index 6d46eb49b1..4cc26a1d3e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,9 +35,9 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab * :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>` * :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>` * :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>` - * :ref:`api-reference` - * :ref:`wrapper` - + * :ref:`wrapper` + * :doc:`Composable Kernel complete class list <./doxygen/html/annotated>` + To contribute to the documentation refer to `Contributing to ROCm `_. You can find licensing information on the `Licensing `_ page. diff --git a/docs/reference/Composable-Kernel-API-reference.rst b/docs/reference/Composable-Kernel-API-reference.rst deleted file mode 100644 index b6ee9f7790..0000000000 --- a/docs/reference/Composable-Kernel-API-reference.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation - -.. _api-reference: - -******************************************************************** -Composable Kernel API reference guide -******************************************************************** - -This document contains details of the APIs for the Composable Kernel library and introduces some of the key design principles that are used to write new classes that extend the functionality of the Composable Kernel library. - -================= -DeviceMem -================= - -.. doxygenstruct:: DeviceMem - -============================= -Kernels For Flashattention -============================= - -The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists -the classes that are used in the CK GPU implementation of Flashattention. - -**Gridwise classes** - -.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle - -**Blockwise classes** - -.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1 - -.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2 - -.. doxygenstruct:: ck::BlockwiseSoftmax - -**Threadwise classes** - -.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic - -.. bibliography:: diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index df98998224..2ef3383d84 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -32,10 +32,10 @@ subtrees: title: Composable Kernel custom types - file: reference/Composable_Kernel_vector_utilities.rst title: Composable Kernel vector utilities - - file: reference/Composable-Kernel-API-reference.rst - title: Composable Kernel API reference - file: reference/Composable-Kernel-wrapper.rst - title: Composable Kernel Wrapper + title: Composable Kernel wrapper + - file: doxygen/html/annotated.rst + title: Composable Kernel class list - caption: About entries: diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 0e61fd33ef..996a543ecc 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -5,6 +5,7 @@ include_directories(BEFORE add_custom_target(examples) + # list of examples that are labelled as REGRESSION_EXAMPLE for make regression (runtime more than 30 seconds) # all other tests are labelled as SMOKE_EXAMPLE set(REGRESSION_EXAMPLES @@ -231,9 +232,6 @@ endfunction(add_example_executable_no_testing EXAMPLE_NAME) # add all example subdir file(GLOB dir_list LIST_DIRECTORIES true *) -if (NOT SUPPORTED_GPU_TARGETS MATCHES "gfx9") - list(FILTER dir_list EXCLUDE REGEX ".*/ck_tile") -endif() FOREACH(subdir ${dir_list}) if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt") add_subdirectory(${subdir}) diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index ce3c8b3978..9ba3a453fc 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -58,8 +58,7 @@ set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_FWD}") -add_executable(${EXAMPLE_FMHA_FWD} fmha_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_FMHA_FWD} COMPONENT examples) +add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp) target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS}) @@ -67,8 +66,7 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_BWD}") -add_executable(${EXAMPLE_FMHA_BWD} fmha_bwd.cpp) -rocm_install(TARGETS ${EXAMPLE_FMHA_BWD} COMPONENT examples) +add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp) target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS}) diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index 74f195a9db..fa69ac0f7a 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -26,8 +26,7 @@ add_custom_command( set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") -add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_LAYERNORM2D_FWD} COMPONENT examples) +add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index deccb71d23..411db2e317 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,8 +1,5 @@ -add_executable(tile_example_gemm_basic gemm_basic.cpp) -rocm_install(TARGETS tile_example_gemm_basic COMPONENT examples) -add_executable(tile_example_gemm_universal universal_gemm.cpp) -rocm_install(TARGETS tile_example_gemm_universal COMPONENT examples) - +add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) +add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) set(EXAMPLE_GEMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/03_gemm/stript.sh b/example/ck_tile/03_gemm/stript.sh deleted file mode 100644 index 4b91cb36ce..0000000000 --- a/example/ck_tile/03_gemm/stript.sh +++ /dev/null @@ -1 +0,0 @@ -for file in gemm_universal_*; do mv "$file" "${file/f16_f16_f16/fp16_fp16_fp16}"; done diff --git a/example/ck_tile/04_img2col/CMakeLists.txt b/example/ck_tile/04_img2col/CMakeLists.txt index d3737467d8..3864c9ed9d 100644 --- a/example/ck_tile/04_img2col/CMakeLists.txt +++ b/example/ck_tile/04_img2col/CMakeLists.txt @@ -1,4 +1,3 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_img2col image_to_column.cpp) -rocm_install(TARGETS tile_example_img2col COMPONENT examples) +add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp) diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt index 855e59c48e..6caa38d50d 100644 --- a/example/ck_tile/05_reduce/CMakeLists.txt +++ b/example/ck_tile/05_reduce/CMakeLists.txt @@ -3,9 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce") # to be included in "make all/install/check" message("adding example ${EXAMPLE_REDUCE}") -add_executable(${EXAMPLE_REDUCE} reduce.cpp) -rocm_install(TARGETS ${EXAMPLE_REDUCE} COMPONENT examples) - +add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp) target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) set(EXAMPLE_REDUCE_COMPILE_OPTIONS) diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt index 22483a4295..327fceb685 100644 --- a/example/ck_tile/06_permute/CMakeLists.txt +++ b/example/ck_tile/06_permute/CMakeLists.txt @@ -1,7 +1,6 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_permute permute.cpp) -rocm_install(TARGETS tile_example_permute COMPONENT examples) +add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp) if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) # set(PERMUTE_USE_ALTERNATIVE_IMPL false) diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt index fc2a4d3fe0..b43b989792 100644 --- a/example/ck_tile/09_topk_softmax/CMakeLists.txt +++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt @@ -1,7 +1,6 @@ -add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp) -rocm_install(TARGETS tile_example_topk_softmax COMPONENT examples) - +add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) + set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt index 731ff639a4..5684c9b2e0 100644 --- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt +++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt @@ -26,8 +26,7 @@ add_custom_command( set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd") message("adding ${TILE_RMSNORM2D_FWD}") -add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp) -rocm_install(TARGETS ${TILE_RMSNORM2D_FWD} COMPONENT examples) +add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp) target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS}) @@ -39,8 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd") -add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_RMSNORM2D_FWD} COMPONENT examples) +add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp) target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt index 7071127e01..6b0c3cef7a 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt @@ -3,8 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd") # to be included in "make all/install/check" message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp) -rocm_install(TARGETS ${TILE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) +add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp) target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS}) @@ -16,8 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd") -add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) +add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp) target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp index 7d82a16aa9..574edf64d3 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp @@ -67,14 +67,13 @@ bool run(const ck_tile::ArgParser& arg_parser) using TypeConfig = AddRmsnormRdquantTypeConfig; - using ADataType = typename TypeConfig::ADataType; - using BDataType = typename TypeConfig::BDataType; - using GammaDataType = typename TypeConfig::GammaDataType; - using XDataType = typename TypeConfig::XDataType; - using UnquantYDataType = ck_tile::null_type; - using YScaleDataType = typename TypeConfig::YScaleDataType; - using QYDataType = typename TypeConfig::QYDataType; - using ComputeDataType = float; + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using XDataType = typename TypeConfig::XDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -89,7 +88,6 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); - ck_tile::HostTensor unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -193,9 +191,8 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType, - UnquantYDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); + InvRmsDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp old mode 100755 new mode 100644 index 3aab357909..ada4c6f2da --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -62,14 +62,13 @@ bool run(const ck_tile::ArgParser& arg_parser) assert(stride >= n); - using ADataType = DataType; - using BDataType = DataType; - using GammaDataType = DataType; - using XDataType = DataType; - using UnquantYDataType = ck_tile::null_type; - using YScaleDataType = float; - using QYDataType = ck_tile::int8_t; - using ComputeDataType = float; + using ADataType = DataType; + using BDataType = DataType; + using GammaDataType = DataType; + using XDataType = DataType; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -82,7 +81,6 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor yscale_host_dev({m}, {1}); ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); - ck_tile::HostTensor unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -195,9 +193,8 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType, - UnquantYDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); + InvRmsDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index daeeb827bd..3849833aca 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -2,8 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} ${MAIN_SRC}) - rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt index 662e16f0d3..09f3e4ac4e 100644 --- a/example/ck_tile/13_moe_sorting/CMakeLists.txt +++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt @@ -1,5 +1,4 @@ -add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp) -rocm_install(TARGETS tile_example_moe_sorting COMPONENT examples) +add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp) target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS) diff --git a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt index 9acb27552a..12224a39a2 100644 --- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -2,8 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} ${MAIN_SRC}) - rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt index bb25a55c7d..a716eef19e 100644 --- a/example/ck_tile/15_fused_moe/CMakeLists.txt +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -3,8 +3,7 @@ set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") # to be included in "make all/install/check" message("adding ${TILE_EXAPMLE_FUSED_MOE}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_EXAPMLE_FUSED_MOE} main.cpp) -rocm_install(TARGETS ${TILE_EXAPMLE_FUSED_MOE} COMPONENT examples) +add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt index 9eb7a45d80..78e78c6b04 100644 --- a/example/ck_tile/16_batched_gemm/CMakeLists.txt +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -1,2 +1 @@ -add_executable(tile_example_batched_gemm batched_gemm.cpp) -rocm_install(TARGETS tile_example_batched_gemm COMPONENT examples) +add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index 80d688125b..d34013dd6c 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(tile_example_grouped_gemm grouped_gemm.cpp) -rocm_install(TARGETS tile_example_grouped_gemm COMPONENT examples) +add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) + diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index 3a70f0447d..9fbe65e3a7 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -1,6 +1,4 @@ -add_executable(tile_example_flatmm_basic flatmm_basic.cpp) -rocm_install(TARGETS tile_example_flatmm_basic COMPONENT examples) - +add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) set(EXAMPLE_FLATMM_COMPILE_OPTIONS) # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/35_batched_transpose/CMakeLists.txt b/example/ck_tile/35_batched_transpose/CMakeLists.txt index 10101e4d2e..a08fcebb74 100644 --- a/example/ck_tile/35_batched_transpose/CMakeLists.txt +++ b/example/ck_tile/35_batched_transpose/CMakeLists.txt @@ -1,9 +1,9 @@ set(TARGET_NAME tile_example_batched_transpose) -add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp) -rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) +add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) # list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS}) + diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 16f68c6255..88efe0d8d9 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -14,11 +14,8 @@ add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) +add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) add_subdirectory(17_grouped_gemm) add_subdirectory(18_flatmm) add_subdirectory(35_batched_transpose) - -if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") - add_subdirectory(15_fused_moe) -endif() diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index ad6641bc13..611aff318f 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -6,7 +6,6 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/concat.hpp" #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" -#include "ck_tile/host/concat.hpp" namespace ck_tile { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 893c9d1ad3..0b38e7789e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -30,7 +30,8 @@ struct GemmPipelineProblemBase using BLayout = remove_cvref_t; using CLayout = remove_cvref_t; - static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity; static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index ecf861e4e8..a31004b425 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -12,8 +12,7 @@ template + typename CLayout_> struct TileGemmTraits { static constexpr bool kPadM = kPadM_; @@ -28,7 +27,7 @@ struct TileGemmTraits using CLayout = CLayout_; static constexpr bool TransposeC = false; - static constexpr bool UseStructuredSparsity = UseStructuredSparsity_; + static constexpr bool UseStructuredSparsity = false; }; template