From 25e2e0f04a451dd6d8cbc3edf8c02a12493e7f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 30 Jan 2025 11:57:39 +0100 Subject: [PATCH 01/18] [CK TILE] Implement cschuflle algorithm (#1842) * [CK TILE] Implement cschuflle algorithm * Rebase * Vector store size fixes * fixes * Fixes * fixes * fmha fix * fixes * fixes of fixes --- example/ck_tile/03_gemm/gemm_basic.cpp | 37 +-- example/ck_tile/03_gemm/universal_gemm.cpp | 18 +- .../ck_tile/16_batched_gemm/batched_gemm.cpp | 42 +-- .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 46 ++- .../ops/epilogue/cshuffle_epilogue.hpp | 307 +++++++++--------- .../ops/epilogue/default_2d_epilogue.hpp | 101 +++++- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 49 +-- .../pipeline/gemm_pipeline_ag_bg_cr_base.hpp | 2 + .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 17 +- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 15 +- .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 10 +- ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 6 +- .../gemm_pipeline_agmem_bgmem_creg_v2.hpp | 6 +- .../gemm/pipeline/gemm_pipeline_problem.hpp | 5 +- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 6 - .../batched_gemm/test_batched_gemm_util.hpp | 43 +-- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 21 +- .../grouped_gemm/test_grouped_gemm_util.hpp | 48 ++- 18 files changed, 408 insertions(+), 371 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index c3a66ba3ea..81fbd96323 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -20,10 +20,6 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& constexpr bool kPadN = false; constexpr bool kPadK = false; - constexpr bool kTilePermute = false; - // The rank and permutation will also be generate out by the CodeGen part. 
- constexpr ck_tile::index_t kOutputRank = 2; - constexpr int kBlockPerCu = 1; // This part comes from the Codegen @@ -39,11 +35,6 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - // Whether doing the CShuffle (transpose before the global memory), depending on the output - // layout. - constexpr bool CShuffleEpilogue = - std::is_same_v; - using CodegenGemmShape = ck_tile::TileGemmShape, ck_tile::sequence, @@ -51,26 +42,24 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& using TilePartitioner = ck_tile::GemmTile2DPartitioner; - using GemmEpilogue = std::conditional_t< - CShuffleEpilogue, - ck_tile::CShuffleEpilogue>, - ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>>; - using CodegenGemmTraits = ck_tile::TileGemmTraits; using CodegenPipelineProblem = ck_tile:: GemmPipelineProblem; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. using Kernel = ck_tile::GemmKernel; diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 5d2bd2df31..fb43e6f504 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -60,9 +60,6 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::sequence>; using TilePartitioner = ck_tile::GemmTile2DPartitioner; - using GemmEpilogue = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>; - using Traits = ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile:: TileGemmUniversalTraits; @@ -95,6 +92,19 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& using GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index 720802236c..2a1cd58255 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -19,12 +19,9 @@ template float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s) { // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadM = false; - constexpr bool kPadN = false; - constexpr bool kPadK = false; - constexpr bool kTilePermute = false; - // The rank and permutation will also be generate out by the CodeGen part. - constexpr ck_tile::index_t kOutputRank = 2; + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; constexpr int kBlockPerCu = 1; @@ -41,11 +38,6 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - // Whether doing the CShuffle (transpose before the global memory), depending on the output - // layout. 
- constexpr bool CShuffleEpilogue = - std::is_same_v; - using CodegenGemmShape = ck_tile::TileGemmShape, ck_tile::sequence, @@ -53,26 +45,24 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre using TilePartitioner = ck_tile::GemmTile2DPartitioner; - using GemmEpilogue = std::conditional_t< - CShuffleEpilogue, - ck_tile::CShuffleEpilogue>, - ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>>; - using CodegenGemmTraits = ck_tile::TileGemmTraits; using CodegenPipelineProblem = ck_tile:: GemmPipelineProblem; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. using Kernel = ck_tile::BatchedGemmKernel; diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index bb4bdbf514..c32fac6c0d 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -20,12 +20,9 @@ namespace { struct GroupedGemmKernelParam { - static const bool kPadM = false; - static const bool kPadN = false; - static const bool kPadK = false; - static const bool kTilePermute = false; - - static const ck_tile::index_t kOutputRank = 2; + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; static const int kBlockPerCu = 1; static const ck_tile::index_t M_Tile = 128; @@ -54,24 +51,6 @@ using CodegenGemmShape = using TilePartitioner = ck_tile::GemmTile1DPartitioner; -template -using GemmEpilogue = std::conditional_t< - std::is_same_v, - ck_tile::CShuffleEpilogue>, - ck_tile::Default2DEpilogue>>; - template using CodegenGemmTraits = ck_tile::TileGemmTraits using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1>; +template +using GemmEpilogue = ck_tile::CShuffleEpilogue::kBlockSize, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + GroupedGemmKernelParam::M_Warp, + GroupedGemmKernelParam::N_Warp, + GroupedGemmKernelParam::M_Warp_Tile, + GroupedGemmKernelParam::N_Warp_Tile, + GroupedGemmKernelParam::K_Warp_Tile, + CodegenPipelineProblem::TransposeC>>; + template using Kernel = ck_tile::GroupedGemmKernel, - GemmEpilogue>; + GemmEpilogue>; }; // namespace std::size_t get_workspace_size(const std::vector& gemm_descs) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 01105d2a82..4aba3d7ec1 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -1,194 +1,189 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include "ck_tile/core.hpp" - -#define CK_TILE_MAX_RANK 5 +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" namespace ck_tile { -// this epilogue aiming to store a matrix with different layout from the shared memory to the global -// memory. template + typename CLayout_, + index_t kBlockSize_, + index_t kM_, + index_t kN_, + index_t kMWave_, + index_t kNWave_, + index_t kMPerXdl_, + index_t kNPerXdl_, + index_t kKPerXdl_, + bool isCTransposed_> struct CShuffleEpilogueProblem { - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; - static constexpr bool kPadM = kPadM_; - static constexpr bool kPadN = kPadN_; - static constexpr bool kTilePermute = kTilePermute_; - static constexpr index_t kRank = kRank_; - static constexpr index_t kPerm[CK_TILE_MAX_RANK] = {kPerm0, kPerm1, kPerm2, kPerm3, kPerm4}; - static constexpr index_t tile_sizes[CK_TILE_MAX_RANK] = { - TileSize0, TileSize1, TileSize2, TileSize3, TileSize4}; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t kBlockSize = kBlockSize_; + static constexpr index_t kMPerBlock = kM_; + static constexpr index_t kNPerBlock = kN_; + static constexpr index_t kMWave = kMWave_; + static constexpr index_t kNWave = kNWave_; + static constexpr index_t kMPerXdl = kMPerXdl_; + static constexpr index_t kNPerXdl = kNPerXdl_; + static constexpr index_t kKPerXdl = kKPerXdl_; + static constexpr index_t isCTransposed = isCTransposed_; }; template struct CShuffleEpilogue { - using Problem = remove_cvref_t; - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; - const index_t* kPerm = Problem::kPerm; - static constexpr bool kTilePermute = Problem::kTilePermute; - static constexpr index_t kRank = Problem::kRank; - const index_t* tile_sizes = 
Problem::tile_sizes; + using Problem = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr index_t kMPerBlock = Problem::kMPerBlock; + static constexpr index_t kNPerBlock = Problem::kNPerBlock; + static constexpr index_t kMWave = Problem::kMWave; + static constexpr index_t kNWave = Problem::kNWave; + static constexpr index_t kMPerXdl = Problem::kMPerXdl; + static constexpr index_t kNPerXdl = Problem::kNPerXdl; + static constexpr index_t kKPerXdl = Problem::kKPerXdl; + static constexpr index_t isCTransposed = Problem::isCTransposed; + static constexpr index_t kMPerIteration = kMPerXdl * kMWave; + static constexpr index_t kNPerIteration = kNPerXdl * kNWave; - // No additional shared memory needed - CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } + using WG = WarpGemmMfmaDispatcher; - CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() + using CWarpDstr = typename WG::CWarpDstr; + using CWarpTensor = typename WG::CWarpTensor; + + /** + * @brief Get the vector store size for C tensor. + * + * @note The vector store size for output C tensor would depend on multiple factors + * like its data layout and warp gemm C transposition. In general it would + * be the number of consecutive elements in contiguous C dimension hold by + * single thread. + * + * @return The vector store size for C tensor. + */ + CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeC() { - // TODO: At now CShuffle doesn't allow to vector store after permute. - // It should be fixed and this function should return true. 
- return false; + constexpr index_t MaxVectorStoreSize = 16; + return MaxVectorStoreSize / sizeof(ODataType); } - template - CK_TILE_DEVICE void permute_tile_data(OAccTile& o_acc_tile) + template + CK_TILE_HOST_DEVICE static constexpr auto MakeLdsBlockDescriptor() { - using DataType = typename OAccTile::DataType; - - // Get thread buffer - auto& thread_buf = o_acc_tile.get_thread_buffer(); - - // Create a temporary buffer to hold the permuted data - thread_buffer permuted_thread_buf; - - // Get the lengths of each dimension - auto thread_tensor_lengths = o_acc_tile.get_lengths(); - - // Total number of elements - index_t total_elements = OAccTile::kThreadElementSpaceSize; - - // Iterate over all elements - for(index_t linear_idx = 0; linear_idx < total_elements; ++linear_idx) + // N is contiguous dimension + if constexpr(std::is_same_v) { - // Convert linear index to multi-dimensional indices - array indices; - index_t remaining = linear_idx; - static_for<0, kRank, 1>{}([&](auto i) { - constexpr auto rev_i = kRank - 1 - i; - indices(rev_i) = remaining % thread_tensor_lengths.get(number{}); - remaining /= thread_tensor_lengths.get(number{}); - }); - - // Apply the permutation - array permuted_indices; - static_for<0, kRank, 1>{}( - [&](auto i) { permuted_indices(i) = indices.get(number{}); }); - - // Compute offsets - index_t dst_offset = 0; - index_t stride = 1; - - static_for<0, kRank, 1>{}([&](auto i) { - constexpr auto rev_i = kRank - 1 - i; - dst_offset += permuted_indices[rev_i] * stride; - stride *= thread_tensor_lengths.get(number{}); - }); - - // Move the data - permuted_thread_buf(dst_offset) = thread_buf[linear_idx]; + return make_naive_tensor_descriptor( + make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{})); } - - // Copy the permuted data back to the original thread buffer - for(index_t i = 0; i < total_elements; ++i) + // M is contiguous dimension + else if constexpr(std::is_same_v) { - thread_buf.set_as(i, permuted_thread_buf.get(i)); 
- } - } - - template - CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, OAccTile& o_acc_tile) - { - const auto& current_window_origin = o_dram_window_tmp.get_window_origin(); - - // Compute the tile coordinates by dividing the window origin by the tile sizes - index_t tile_coords[CK_TILE_MAX_RANK] = {0}; - for(index_t i = 0; i < kRank; ++i) - { - tile_coords[i] = current_window_origin[i] / tile_sizes[i]; - // printf("The tile_coord is: %d", tile_coords[i]); - } - - // Apply the permutation to the tile coordinates - index_t permuted_tile_coords[CK_TILE_MAX_RANK]; - for(index_t i = 0; i < kRank; ++i) - { - permuted_tile_coords[i] = tile_coords[kPerm[i]]; - // printf("The new permuted_tile_coords is: %d", permuted_tile_coords[i]); - } - - // Compute the permuted window origin - index_t permuted_window_origin[CK_TILE_MAX_RANK] = {0}; - for(index_t i = 0; i < kRank; ++i) - { - permuted_window_origin[i] = permuted_tile_coords[i] * tile_sizes[i]; - // printf("The new permuted_window_origin is: %d", permuted_window_origin[i]); - } - - typename ODramWindowTmp::BottomTensorIndex step = {}; - for(index_t i = 0; i < kRank; ++i) - { - step[i] = permuted_window_origin[i] - current_window_origin[i]; - } - - // Move the window - move_tile_window(o_dram_window_tmp, step); - - // Permute the data within the tile if necessary - if constexpr(kTilePermute) - { - permute_tile_data(o_acc_tile); - } - - // Store the tile data to the permuted location - if constexpr(kPadM || kPadN) - { - if constexpr(out_memory_data_op == memory_operation_enum::set) - { - store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); - } - else - { - update_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); - } - buffer_store_fence(); + return make_naive_tensor_descriptor( + make_tuple(number{}, number{}), + make_tuple(number<1>{}, number{})); } else { + static_assert(false, "Unsupported CLayout!"); + } + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return kMWave * 
kNWave * kMPerXdl * kNPerXdl * sizeof(ODataType); + } + + template + CK_TILE_DEVICE auto + operator()(ODramWindow& out_dram_window, const OAccTile& o_acc_tile, void* p_smem) + { + + const index_t iMWarp = get_warp_id() / kNWave; + const index_t iNWarp = get_warp_id() - iMWarp * kNWave; + + constexpr auto lds_block_desc = MakeLdsBlockDescriptor(); + auto o_lds_block = make_tensor_view( + static_cast(p_smem), lds_block_desc); + auto in_lds_window = + make_tile_window(o_lds_block, + make_tuple(number{}, number{}), + {number{} * iMWarp, number{} * iNWarp}); + auto out_lds_window = + make_tile_window(o_lds_block, + make_tuple(number{}, number{}), + {0, 0}); + + using SFC = space_filling_curve, + sequence<0, 1>, + sequence>; + constexpr index_t num_access = SFC::get_num_of_access(); + + using TileEncodingPattern = + TileDistributionEncodingPattern2D; + constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution(); + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + CWarpTensor c_warp_in_tensor; + static_for<0, num_access, 1>{}([&](auto iAccess) { + constexpr auto idx_y_start = SFC::get_index(iAccess); + + constexpr auto mIter = number{}) / (kMPerXdl * kMWave)>{}; + constexpr auto nIter = number{}) / (kNPerXdl * kNWave)>{}; + + c_warp_in_tensor.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + const auto c_warp_in_tensor_casted = cast_tile(c_warp_in_tensor); + + block_sync_lds(); + store_tile(in_lds_window, c_warp_in_tensor_casted); + block_sync_lds(); + + const auto c_out_tensor = + load_tile(make_tile_window(out_lds_window, dram_tile_distribution)); + if constexpr(out_memory_data_op == memory_operation_enum::set) { - store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + store_tile(out_dram_window, 
c_out_tensor); } else { - update_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); + update_tile(out_dram_window, c_out_tensor); } - } + if constexpr(iAccess != num_access - 1) + { + constexpr auto step = SFC::get_forward_step(iAccess); + move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})}); + } + }); } }; - } // namespace ck_tile diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp index 177573de34..6e290fe6d7 100644 --- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp @@ -1,9 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" namespace ck_tile { @@ -23,6 +25,26 @@ struct Default2DEpilogueProblem static constexpr bool UseRawStore = UseRawStore_; }; +template +struct DefaultGemm2DEpilogueProblem + : public Default2DEpilogueProblem +{ + using CLayout = remove_cvref_t; + static constexpr index_t kMPerXdl = kMPerXdl_; + static constexpr index_t kNPerXdl = kNPerXdl_; + static constexpr index_t kKPerXdl = kKPerXdl_; + static constexpr index_t isCTransposed = isCTransposed_; +}; + template struct Default2DEpilogue { @@ -35,14 +57,13 @@ struct Default2DEpilogue CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } - CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() { return false; } - // TODO: this function assume store out vector size is the same as OAccTile last dimension size // how do we fix this ? 
template - CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile) + CK_TILE_DEVICE auto + operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile, void* = nullptr) { // TODO: this is ugly @@ -71,4 +92,76 @@ struct Default2DEpilogue } } }; + +template +struct DefaultGemm2DEpilogue : public Default2DEpilogue +{ + using Problem = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t kMPerXdl = Problem::kMPerXdl; + static constexpr index_t kNPerXdl = Problem::kNPerXdl; + static constexpr index_t kKPerXdl = Problem::kKPerXdl; + static constexpr index_t isCTransposed = Problem::isCTransposed; + + using WG = WarpGemmMfmaDispatcher; + + using CWarpDstr = typename WG::CWarpDstr; + + CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeC() + { + // N is contiguous dimension + if constexpr(std::is_same_v) + { + if constexpr(isCTransposed) + { + // In this case each thread has multiple consecutive elements in + // N dimension, however consecutive threads' elements have stride. + constexpr index_t NDimY = CWarpDstr::NDimY; + constexpr auto c_warp_y_lengths = + CWarpDstr{}.get_ys_to_d_descriptor().get_lengths(); + static_assert(WG::WarpGemmAttribute::Impl::kCM1PerLane == + c_warp_y_lengths.get(number{})); + return c_warp_y_lengths.get(number{}); + } + else + { + // In this case each thread has just a single item in Ndim + return WG::WarpGemmAttribute::Impl::kCNLane / WG::kN; + } + } + // M is contiguous dimension + else if constexpr(std::is_same_v) + { + if constexpr(isCTransposed) + { + // In this case each thread has just a single item in Mdim + return WG::WarpGemmAttribute::Impl::kCNLane / WG::kN; + } + else + { + // In this case each thread has multiple consecutive elements in + // M dimension, however consecutive threads' elements have stride. 
+ constexpr index_t NDimY = CWarpDstr::NDimY; + constexpr auto c_warp_y_lengths = + CWarpDstr{}.get_ys_to_d_descriptor().get_lengths(); + static_assert(WG::WarpGemmAttribute::Impl::kCM1PerLane == + c_warp_y_lengths.get(number{})); + return c_warp_y_lengths.get(number{}); + } + } + else + { + static_assert(false, "Unsupported CLayout!"); + } + } +}; + } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 8d640831df..774736e1fa 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -159,12 +159,8 @@ struct GemmKernel CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs) { - constexpr bool is_output_c_reg_transposed = - EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC(); - if constexpr(!((GemmPipeline::VectorSizeC % 2 == 0 && - std::is_same_v && - is_output_c_reg_transposed) || - !(std::is_same_v || std::is_same_v))) + if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value) { if(kargs.KBatch != 1) { @@ -182,7 +178,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.K % GemmPipeline::VectorSizeA != 0) + if(kargs.K % GemmPipeline::GetVectorSizeA() != 0) { std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl; return false; @@ -197,7 +193,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.M % GemmPipeline::VectorSizeA != 0) + if(kargs.M % GemmPipeline::GetVectorSizeA() != 0) { std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl; return false; @@ -213,7 +209,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.N % GemmPipeline::VectorSizeB != 0) + if(kargs.N % GemmPipeline::GetVectorSizeB() != 0) { std::cerr << "N is not a multiple of vector load size for B tensor!" 
<< std::endl; return false; @@ -228,7 +224,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.K % GemmPipeline::VectorSizeB != 0) + if(kargs.K % GemmPipeline::GetVectorSizeB() != 0) { std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl; return false; @@ -244,7 +240,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.N % GemmPipeline::VectorSizeC != 0) + if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0) { std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl; return false; @@ -259,7 +255,7 @@ struct GemmKernel << std::endl; return false; } - if(kargs.M % GemmPipeline::VectorSizeC != 0) + if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0) { std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl; return false; @@ -275,14 +271,6 @@ struct GemmKernel const GemmKernelArgs& kargs, const SplitKBatchOffset& splitk_batch_offset) { - // const auto idxs = TilePartitioner{}(); - // const auto i_m = idxs.at(number<0>{}); - // const auto i_n = idxs.at(number<1>{}); - // // options - // const ADataType* a_start = static_cast(kargs.a_ptr); - // const BDataType* b_start = static_cast(kargs.b_ptr); - // // Convert pointers to tensor views - // auto a_tensor_view = [&]() { const auto& a_tensor_view = [&]() { if constexpr(std::is_same_v) { @@ -290,7 +278,7 @@ struct GemmKernel a_ptr, make_tuple(kargs.M, splitk_batch_offset.splitted_k), make_tuple(kargs.stride_A, 1), - number{}, + number{}, number<1>{}); } else @@ -299,7 +287,7 @@ struct GemmKernel a_ptr, make_tuple(splitk_batch_offset.splitted_k, kargs.M), make_tuple(kargs.stride_A, 1), - number{}, + number{}, number<1>{}); } }(); @@ -311,7 +299,7 @@ struct GemmKernel b_ptr, make_tuple(splitk_batch_offset.splitted_k, kargs.N), make_tuple(kargs.stride_B, 1), - number{}, + number{}, number<1>{}); } else @@ -320,7 +308,7 @@ struct GemmKernel b_ptr, make_tuple(kargs.N, splitk_batch_offset.splitted_k), 
make_tuple(kargs.stride_B, 1), - number{}, + number{}, number<1>{}); } }(); @@ -333,7 +321,7 @@ struct GemmKernel c_ptr, make_tuple(kargs.M, kargs.N), make_tuple(kargs.stride_C, 1), - number{}, + number{}, number<1>{}); } else @@ -501,16 +489,13 @@ struct GemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - constexpr bool is_output_c_reg_transposed = - EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC(); - if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) || - (GemmPipeline::VectorSizeC % 2 == 0 && - std::is_same_v && - is_output_c_reg_transposed)) + if constexpr(DstInMemOp == memory_operation_enum::set || + !(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) { EpiloguePipeline{} .template operator()( - c_block_window, c_block_tile); + c_block_window, c_block_tile, smem_ptr); } } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp index 6acc547dbf..c08fe45465 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp @@ -21,6 +21,8 @@ struct GemmPipelineAgBgCrImplBase static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; + CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } + template CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, SrcTileWindow& dram_tile_window, diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp index 70de4014c1..0bd7807238 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrCompV3 static constexpr 
index_t PrefillStages = 1; static constexpr index_t GlobalBufferNum = 1; + CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } + CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) { return num_loop > PrefetchStages; @@ -62,9 +64,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; - static constexpr index_t VectorSizeA = Policy::template GetVectorSizeA(); - static constexpr index_t VectorSizeB = Policy::template GetVectorSizeB(); - static constexpr index_t VectorSizeC = Policy::template GetVectorSizeC(); + static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA(); } + static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB(); } + static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC(); } static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadN = Problem::kPadN; @@ -81,11 +83,6 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 return Policy::template GetSmemSize(); } - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() - { - return Policy::template IsTransposeC(); - } - template struct PipelineImpl : public PipelineImplBase { @@ -110,9 +107,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 constexpr index_t B_LDS_Read_Width = KPerXDL; constexpr index_t A_Buffer_Load_Inst_Num = - MPerBlock * KPerBlock / (BlockSize * VectorSizeA); + MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA()); constexpr index_t B_Buffer_Load_Inst_Num = - NPerBlock * KPerBlock / (BlockSize * VectorSizeB); + NPerBlock * KPerBlock / (BlockSize * GetVectorSizeB()); constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL); constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL); diff --git 
a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 1d6a9a0b87..38c663f4c3 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrMem using BDataType = remove_cvref_t; using BlockGemmShape = remove_cvref_t; + CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } + static constexpr index_t BlockSize = Problem::kBlockSize; static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t NPerBlock = BlockGemmShape::kN; @@ -113,9 +115,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; - static constexpr index_t VectorSizeA = Policy::template GetVectorSizeA(); - static constexpr index_t VectorSizeB = Policy::template GetVectorSizeB(); - static constexpr index_t VectorSizeC = Policy::template GetVectorSizeC(); + static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA(); } + static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB(); } + static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC(); } static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadN = Problem::kPadN; @@ -133,11 +135,6 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem return Policy::template GetSmemSize(); } - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() - { - return Policy::template IsTransposeC(); - } - template struct PipelineImpl : public PipelineImplBase { diff 
--git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index ccb2f81d4b..d9f04a87c3 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -31,21 +31,21 @@ struct GemmPipelineAGmemBGmemCRegV1 static constexpr index_t kNPerBlock = BlockGemmShape::kN; static constexpr index_t kKPerBlock = BlockGemmShape::kK; - static constexpr index_t VectorSizeA = Problem::VectorSizeA; - static constexpr index_t VectorSizeB = Problem::VectorSizeB; - static constexpr index_t VectorSizeC = Problem::VectorSizeC; + static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; } + static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; } + static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; } static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadN = Problem::kPadN; static constexpr bool kPadK = Problem::kPadK; + CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); } - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } - template {}; static constexpr auto I2 = number<2>{}; - static constexpr bool TransposeC = true; - // 3d + padding template CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() @@ -383,8 +381,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy } } - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; } - template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { @@ -397,7 +393,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy WarpTile::at(I0), WarpTile::at(I1), WarpTile::at(I2), - TransposeC>; + Problem::TransposeC>; using BlockGemmPolicy = 
BlockGemmASmemBSmemCRegV1CustomPolicy().get_element_space_size(); } - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); } - template ; using CLayout = remove_cvref_t; + static constexpr bool TransposeC = Traits::TransposeC; + static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); static constexpr bool kPadM = Traits::kPadM; @@ -111,7 +113,6 @@ struct GemmPipelineProblemBase return kPadK ? 1 : GetAlignmentB(); } }(); - static constexpr index_t VectorSizeC = []() { if constexpr(std::is_same_v) { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 31a837aa45..33f105a435 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -549,12 +549,6 @@ struct UniversalGemmPipelineAgBgCrPolicy return TileEncodingPattern::MakeShuffled2DStaticTileDistribution(); } - template - CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() - { - return Problem::TransposeC; - } - template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp index ab534ffcfa..047e0a2939 100644 --- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -29,12 +29,9 @@ class TestCkTileBatchedGemm : public ::testing::Test const ck_tile::stream_config& s) { // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. - constexpr bool kPadM = false; - constexpr bool kPadN = false; - constexpr bool kPadK = false; - constexpr bool kTilePermute = false; - // The rank and permutation will also be generate out by the CodeGen part. 
- constexpr ck_tile::index_t kOutputRank = 2; + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; constexpr int kBlockPerCu = 1; @@ -51,11 +48,6 @@ class TestCkTileBatchedGemm : public ::testing::Test constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 8; - // Whether doing the CShuffle (transpose before the global memory), depending on the output - // layout. - constexpr bool CShuffleEpilogue = - std::is_same_v; - using CodegenGemmShape = ck_tile::TileGemmShape, ck_tile::sequence, @@ -63,21 +55,6 @@ class TestCkTileBatchedGemm : public ::testing::Test using TilePartitioner = ck_tile::GemmTile2DPartitioner; - using GemmEpilogue = std::conditional_t< - CShuffleEpilogue, - ck_tile::CShuffleEpilogue>, - ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>>; - using CodegenGemmTraits = ck_tile::TileGemmTraits; @@ -88,6 +65,20 @@ class TestCkTileBatchedGemm : public ::testing::Test CodegenGemmTraits>; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; using Kernel = ck_tile::BatchedGemmKernel; diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 1474498726..647b54cb8e 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include @@ -65,9 +65,6 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::sequence>; using TilePartitioner = ck_tile::GemmTile2DPartitioner; - using GemmEpilogue = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>; - using Traits = ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile:: TileGemmUniversalTraits; @@ -106,6 +103,20 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::GemmPipelineAgBgCrCompV3>; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -244,7 +255,7 @@ class TestCkTileGemmPipeline : public ::testing::Test public: std::vector k_batches_; - void SetUp() override { k_batches_ = {1}; } + void SetUp() override { k_batches_ = {1, 2}; } template void Run(const int M, diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp index a1b767d853..6b9bf0c6f7 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include @@ -26,12 +26,9 @@ class TestCkTileGroupedGemm : public ::testing::Test struct GroupedGemKernelParam { - static const bool kPadM = false; - static const bool kPadN = false; - static const bool kPadK = false; - static const bool kTilePermute = false; - - static const ck_tile::index_t kOutputRank = 2; + static const bool kPadM = false; + static const bool kPadN = false; + static const bool kPadK = false; static const int kBlockPerCu = 1; static const ck_tile::index_t M_Tile = 128; @@ -60,26 +57,6 @@ class TestCkTileGroupedGemm : public ::testing::Test using TilePartitioner = ck_tile::GemmTile1DPartitioner; - template - using GemmEpilogue = - std::conditional_t, - ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>, - ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem>>; - template using CodegenGemmTraits = ck_tile::TileGemmTraits>; + template + using GemmEpilogue = ck_tile::CShuffleEpilogue::BlockSize, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + GroupedGemKernelParam::M_Warp, + GroupedGemKernelParam::N_Warp, + GroupedGemKernelParam::M_Warp_Tile, + GroupedGemKernelParam::N_Warp_Tile, + GroupedGemKernelParam::K_Warp_Tile, + CodegenPipelineProblem::TransposeC>>; + template using Kernel = ck_tile::GroupedGemmKernel, - GemmEpilogue>; + GemmEpilogue>; using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; std::size_t GetWorkspaceSize(const std::vector& gemm_descs) From dcbfa795420d352aaa9af447df69af15653a4c1d Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 30 Jan 2025 07:03:48 -0800 Subject: [PATCH 02/18] turn on the ck_tile gemm tests by default (#1849) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2d8f7561f2..b212d2d0ab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -796,7 +796,7 @@ pipeline { booleanParam( name: "RUN_CK_TILE_GEMM_TESTS", defaultValue: false, - description: "Run the ck_tile 
GEMM tests (default: OFF)") + description: "Run the ck_tile GEMM tests (default: ON)") booleanParam( name: "BUILD_INSTANCES_ONLY", defaultValue: false, From e6d418049834ca70c5756c6736947dc7a42a4740 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 Jan 2025 07:04:27 -0800 Subject: [PATCH 03/18] Bump rocm-docs-core from 1.14.1 to 1.15.0 in /docs/sphinx (#1848) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.14.1 to 1.15.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.14.1...v1.15.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 7f48a51ce8..e9df8c9f5f 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.14.1 +rocm-docs-core==1.15.0 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 0332e19bc7..a42fdf09bf 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -199,7 +199,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.14.1 +rocm-docs-core==1.15.0 # via -r requirements.in rpds-py==0.22.3 # via From ce448002ee9988d51b87039fa7769c942f05fd45 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Fri, 31 Jan 2025 00:10:16 +0100 Subject: [PATCH 04/18] [CK Tile] Spatially local GEMM tile partitioner. 
(#1843) * Add spatially local tile partitioner * Use 1D Grid size & create partitioner object. * Docs & use 1D partitioner in example. * Clang format. * Change kernel grid size Now: X is the # of output C-tiles, Y is the batch count Z is the splitK * Formatting & more doc. * Clang format. * Fix batched gemm test. Use 1d partitioner. * Move condition. * Fix ctor. * clang-format. --- example/ck_tile/03_gemm/gemm_basic.cpp | 2 +- example/ck_tile/03_gemm/gemm_basic.hpp | 2 +- example/ck_tile/03_gemm/universal_gemm.cpp | 7 +- .../ck_tile/16_batched_gemm/batched_gemm.cpp | 2 +- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 12 +- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 41 +-- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 287 +++++++++++++++--- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 11 +- .../batched_gemm/test_batched_gemm_util.hpp | 2 +- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 7 +- 10 files changed, 285 insertions(+), 88 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 81fbd96323..b667886f84 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -40,7 +40,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTile2DPartitioner; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; using CodegenGemmTraits = ck_tile::TileGemmTraits; diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp index 4500e3b4fd..3fdc4ac46c 100644 --- a/example/ck_tile/03_gemm/gemm_basic.hpp +++ b/example/ck_tile/03_gemm/gemm_basic.hpp @@ -79,7 +79,7 @@ auto create_args(int argc, char* argv[]) .insert("n", "4096", "n dimension") .insert("k", "2048", "k dimension") .insert("a_layout", "R", "A tensor data layout - Row by default") - .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("b_layout", "C", 
"B tensor data layout - Column by default") .insert("c_layout", "R", "C tensor data layout - Row by default") .insert("stride_a", "0", "Tensor A stride") .insert("stride_b", "0", "Tensor B stride") diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index fb43e6f504..eaaf3dbed9 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -50,7 +50,9 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& constexpr bool TransposeC = false; - constexpr int kBlockPerCu = 1; + constexpr int kBlockPerCu = 1; + constexpr ck_tile::index_t TileParitionerGroupNum = 8; + constexpr ck_tile::index_t TileParitionerM01 = 4; // =============================================== @@ -58,7 +60,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::TileGemmShape, ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTile2DPartitioner; + using TilePartitioner = ck_tile:: + GemmSpatiallyLocalTilePartitioner; using Traits = ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile:: diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index 2a1cd58255..949621e116 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -43,7 +43,7 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTile2DPartitioner; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; using CodegenGemmTraits = ck_tile::TileGemmTraits; diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 4b4a4d7a09..0f8bec3cf4 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ 
b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -70,7 +70,7 @@ struct BatchedGemmKernel : public GemmKernelRunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); } diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 774736e1fa..4c65f51914 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -75,12 +75,12 @@ struct GemmKernel static constexpr auto I1 = number<1>(); static constexpr auto I2 = number<2>(); - __host__ static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) { - return TilePartitioner::GridSize(M, N, KBatch); + return dim3(TilePartitioner::GridSize(M, N), 1, KBatch); } - __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); } struct GemmKernelArgs { @@ -93,7 +93,7 @@ struct GemmKernel index_t stride_A; index_t stride_B; index_t stride_C; - index_t KBatch; + index_t k_batch; }; CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs) @@ -121,7 +121,7 @@ struct GemmKernel const std::size_t k_id = blockIdx.z) { constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); - const index_t K_t = kargs.KBatch * K1; + const index_t K_t = kargs.k_batch * K1; const index_t KRead = (kargs.K + K_t - 1) / K_t * K1; if constexpr(std::is_same_v) @@ -142,13 +142,13 @@ struct GemmKernel b_k_split_offset = k_id * KRead; } - if(k_id < static_cast(kargs.KBatch - 1)) + if(k_id < static_cast(kargs.k_batch - 1)) { splitted_k = KRead; } else { - splitted_k = kargs.K - KRead * (kargs.KBatch - 1); + splitted_k = kargs.K - KRead * (kargs.k_batch - 1); } } @@ -162,7 +162,7 @@ struct GemmKernel if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && is_any_of::value) 
{ - if(kargs.KBatch != 1) + if(kargs.k_batch != 1) { std::cerr << "Conditions not met for Kbatch >1 !" << std::endl; return false; @@ -489,19 +489,14 @@ struct GemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - if constexpr(DstInMemOp == memory_operation_enum::set || - !(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && - is_any_of::value)) - { - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, smem_ptr); - } + EpiloguePipeline{} + .template operator()( + c_block_window, c_block_tile, smem_ptr); } CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const { - const auto [iM, iN] = TilePartitioner::GetOutputTileIndex(blockIdx.x, blockIdx.y); + const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); @@ -516,14 +511,20 @@ struct GemmKernel // allocate LDS __shared__ char smem_ptr[GetSmemSize()]; - if(kargs.KBatch == 1) + if(kargs.k_batch == 1) { RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); } else { - RunGemm( - a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + // Do not compile in case where we have unsupported + // VectorSizeC & data type configuration. + if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) + { + RunGemm( + a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } } } }; diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index eb2b817db6..d8c0239153 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -1,13 +1,21 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+/** + * @file + * GemmTilePartitioner allows customized mapping between a workgroup and the C-tile it computes. + */ + #pragma once #include "ck_tile/core.hpp" namespace ck_tile { -/** @brief Struct representing 2D block index mapping into 3D output tile space. */ +/** + * @brief Class providing 2D workgroup index mapping into 2D output GEMM C-tile space. + * + */ template struct GemmTile2DPartitioner { @@ -17,21 +25,32 @@ struct GemmTile2DPartitioner static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; - /** @brief Returns 3D grid size. */ - CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t batch_size) noexcept( - noexcept(MPerBlock != 0 && NPerBlock != 0)) -> dim3 + CK_TILE_HOST_DEVICE GemmTile2DPartitioner() noexcept = delete; + CK_TILE_HOST_DEVICE GemmTile2DPartitioner([[maybe_unused]] index_t M, + [[maybe_unused]] index_t N) noexcept; + + /** + * @brief Calculates GEMM kernel grid size. + * + * @param M GEMM's M dimension. + * @param N GEMM's N dimension. + * @return dim3 Structure holding grid's X,Y and Z dimensions. + */ + CK_TILE_HOST static auto + GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock != 0 && NPerBlock != 0)) -> dim3 { const index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; const index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; - const index_t GridDimZ = batch_size; - return dim3(GridDimX, GridDimY, GridDimZ); + return dim3(GridDimX, GridDimY, 1); } /** - * @brief Returns the number of loops. - * @param [in] K is dimension + * @brief Calculate number of loop iterations over GEMM's K dimension. + * + * @param K GEMM's K dimension. + * @return index_t The number of loop iterations over K dimension. 
*/ - CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) noexcept -> index_t + CK_TILE_HOST_DEVICE static auto GetLoopNum(index_t K) noexcept -> index_t { return integer_divide_ceil(K, KPerBlock); } @@ -42,8 +61,15 @@ struct GemmTile2DPartitioner * @param [in] blockIdy is blockIdx.y * @return Returns the output tile indexes. */ - CK_TILE_DEVICE static constexpr auto GetOutputTileIndex(index_t blockIdx, - index_t blockIdy) noexcept + + /** + * @brief Calculate workgroup 2D index mapping into 2D output C-tile space. + * + * @param blockIdx WGP's X index. + * @param blockIdy WGP's Y index. + * @return const tuple Tuple containing 2D output C-tile index. + */ + CK_TILE_DEVICE static auto GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple { const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx); @@ -53,61 +79,71 @@ struct GemmTile2DPartitioner }; /** - * @brief Struct representing 1D block index mapping into 2D output tile space. + * @brief Class providing 1D WGP index mapping into 2D output C-tile space. + * + * @tparam BlockGemmShape_ A class providing basic GEMM parameters. \link TileGemmShape */ -template +template struct GemmTile1DPartitioner { - using BlockGemmShape = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; - /** @brief delete default ctr with no any object */ - constexpr GemmTile1DPartitioner() noexcept = delete; + CK_TILE_HOST_DEVICE GemmTile1DPartitioner() noexcept = delete; - /** @brief constructs an object that does contain a N value. */ - constexpr GemmTile1DPartitioner(index_t N) noexcept { N_ = N; } + /** + * @brief Construct a new GemmTile1DPartitioner object. + * + * @param M GEMM's M dimension. + * @param N GEMM's N dimension. 
+ */ + CK_TILE_HOST_DEVICE GemmTile1DPartitioner([[maybe_unused]] index_t M, index_t N) noexcept + { + N_ = N; + } - /** @brief Returns 1D grid size. */ - CK_TILE_HOST static constexpr auto - GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock != 0 && NPerBlock != 0)) -> dim3 + /** + * @brief Calculates GEMM kernel grid size. + * + * @param M GEMM's M dimension. + * @param N GEMM's N dimension. + * @return index_t A total number of workgroups. + */ + CK_TILE_HOST static auto + GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock != 0 && NPerBlock != 0)) -> index_t { const index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; const index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; - return dim3(GridDimX * GridDimY, 1, 1); + return GridDimX * GridDimY; } /** - * @brief Returns the number of blocks in N. - * @param [in] N is dimension + * @brief Calculate number of loop iterations over GEMM's K dimension. + * + * @param K GEMM's K dimension. + * @return index_t The number of loop iterations over K dimension. */ - CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N) noexcept -> index_t - { - return integer_divide_ceil(N, NPerBlock); - } - - /** - * @brief Returns the number of loops. - * @param [in] K is dimension - */ - CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) noexcept -> index_t + CK_TILE_HOST_DEVICE static auto GetLoopNum(index_t K) noexcept -> index_t { return integer_divide_ceil(K, KPerBlock); } /** - * @brief The function returns 2D output tile space. - * @param [in] blockIdx is blockIdx.x - block_start. - * */ - CK_TILE_DEVICE static constexpr auto GetOutputTileIndex(index_t blockIdx) noexcept + * @brief Calculate workgroup 1D index mapping into 2D output C-tile space. + * + * @param blockIdx WGP's index. + * @return const tuple Tuple containing 2D output C-tile index. 
+ */ + CK_TILE_DEVICE static auto GetOutputTileIndex(index_t blockIdx) noexcept -> const tuple { - const index_t NBlock = GetNBlock(N_); + const index_t NBlocks = integer_divide_ceil(N_, NPerBlock); - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlock); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - (iM)*NBlock); + const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks); + const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks); return make_tuple(iM, iN); } @@ -141,21 +177,176 @@ struct HasFnOneArgImpl().GetOutputTileIn * enable-if `GetOutputTileIndex`-fn is std::true_type when `GetOutputTileIndex`-fn is well-formed, * otherwise std::false_type. */ -template {}>> +template {}>> struct OffsettedTile1DPartitioner { /** * @brief The function subtracts the block's start (offset) from 1D raw-indexes. - * @param [in] block_start is `blockIdx.x - block_start`. - * @return Returns a `tuple` [Im, In] shifted index, used to shift 1d-tile index. + * @param [in] block_start Workgroup offset. + * @param [in] M Gemm's M dimension. + * @param [in] N Gemm's N dimension. + * @return Returns a `tuple` [Im, In] with shifted index. */ - [[nodiscard]] CK_TILE_DEVICE static constexpr auto GetOffsetedTileIndex(index_t block_start, - index_t N) noexcept + [[nodiscard]] CK_TILE_DEVICE static auto + GetOffsetedTileIndex(index_t block_start, index_t M, index_t N) noexcept -> const tuple { - const auto [iM, iN] = PartitionerFn(N).GetOutputTileIndex(blockIdx.x - block_start); + const auto [iM, iN] = TilePartitioner{M, N}.GetOutputTileIndex(blockIdx.x - block_start); return make_tuple(iM, iN); } }; + +/** + * @brief Class mapping 1D block index into 2D output tile space. + * + * @note It groups spatially workgroups in order to better utilize caches. + * It is using grouped Rows of column-vectors WGP pattern. It's optimized + * for gfx94x-like multiple-die chip. + * + * @tparam GroupNum - The number of big groups. 
+ * @tparam M01 - The number of groups in M dim within spatially local WGPs, + * + */ +template +struct GemmSpatiallyLocalTilePartitioner +{ + using BlockGemmShape = remove_cvref_t; + + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; + + CK_TILE_HOST_DEVICE GemmSpatiallyLocalTilePartitioner() noexcept = delete; + CK_TILE_HOST_DEVICE GemmSpatiallyLocalTilePartitioner(index_t M_, index_t N_) noexcept + : M(M_), N(N_) + { + } + + /** + * @brief Calculates GEMM kernel grid size. + * + * @param M GEMM's M dimension. + * @param N GEMM's N dimension. + * @return index_t A total number of workgroups. + */ + CK_TILE_HOST static auto + GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock != 0 && NPerBlock != 0)) -> index_t + { + const index_t GridDimX = integer_divide_ceil(M, MPerBlock); + const index_t GridDimY = integer_divide_ceil(N, NPerBlock); + return GridDimX * GridDimY; + } + + /** + * @brief Calculate number of loop iterations over GEMM's K dimension. + * + * @param K GEMM's K dimension. + * @return index_t The number of loop iterations over K dimension. + */ + CK_TILE_HOST_DEVICE static auto GetLoopNum(index_t K) noexcept -> index_t + { + return integer_divide_ceil(K, KPerBlock); + } + + /** + * @brief Calculate workgroup 1D index mapping into 2D output C-tile space. + * + * @param [in] block_1d_id WGP's index. + * @return const tuple Tuple containing 2D output C-tile index. 
+ */ + CK_TILE_DEVICE auto GetOutputTileIndex(index_t block_1d_id) noexcept + -> const tuple + { + const auto M0 = integer_divide_ceil(M, MPerBlock); + const auto N0 = integer_divide_ceil(N, NPerBlock); + + if(M0 == 1) + { + return make_tuple(0, block_1d_id); + } + else if(N0 == 1) + { + return make_tuple(block_1d_id, 0); + } + // block_1d_id = block_1d_id % (M0 * N0); // swallow batch index + else + { + const auto group_size = integer_divide_ceil(M0 * N0, GroupNum); + const auto big_group_num = GroupNum - (group_size * GroupNum - M0 * N0); + const auto group_id_y = block_1d_id / GroupNum; + const auto group_id_x = block_1d_id - group_id_y * GroupNum; + const auto remap_block_1d_id = + group_id_x <= big_group_num + ? group_id_x * group_size + group_id_y + : group_id_x * group_size + big_group_num - group_id_x + group_id_y; + + const index_t idx_M0 = remap_block_1d_id / N0; + const index_t idx_N0 = remap_block_1d_id - idx_M0 * N0; + + const index_t M0_tmp = M0 / M01; + const index_t M0_mod_M01 = M0 - M0_tmp * M01; + + const auto M01_adapt = (idx_M0 < M0 - M0_mod_M01) ? 
M01 : M0_mod_M01; + + const index_t idx_M00 = idx_M0 / M01; + const index_t idx_M01 = idx_M0 - idx_M00 * M01; + const index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; + + /** + * idxN0 + * + * |< mtx N >| + * + * NPerBlock NPerBlock NPerBlock NPerBlock + * N_0 N_1 N_2 N_3 + * - |-----------|-----------|-----------|-----|-----|- + * ^ | - - 0 |/----> 2 | | | | + * | | | / | | | | | M_0 MPerBlock + * | M | /| | | | | | + * |-0---|---/-|-----|-----|-----------|-----|-----|- + * | 1 | / | | | blockid | | | + * idxM0 | | | / | V | 5 | | | M_1 MPerBlock + * | - V 1 | - 3 | | | | + * |-----------|-----------|-----------|-----|-----|- + * mtx M | | | | | | + * | | | | | | M_2 MPerBlock + * | | | | | | + * |-----------|-----------|-----------|-----|-----|- + * | | | | | | + * | | | | | | M_3 MPerBlock + * | | | | | | + * |-----------|-----------|-----------|-----|-----|- + * V | | | | | | + * - |-----------|-----------|-----------|-----|-----|- M_4 MPerBlock + * | | | | | | + * |-----------|-----------|-----------|-----|-----|- + * Example: + * assume: + * M0 = 5 + * N0 = 4 + * block_1d_id = 5 + * M01 = 2 + * + * idx_N0 = 1 + * idx_M0 = 1 + * M01_adapt = 2 + * idx_M00 = 0 + * idx_M01 = 1 + * idx_N0_M01_local = 5 + * output {1, 2} + */ + + const index_t N_out = idx_N0_M01_local / M01_adapt; + const index_t idx_loc_mod_M01 = idx_N0_M01_local - N_out * M01_adapt; + + return make_tuple(idx_loc_mod_M01 + idx_M00 * M01, N_out); + } + } + + private: + index_t M; + index_t N; +}; + } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index 656939770c..13d3df02f9 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -77,8 +77,8 @@ struct GroupedGemmKernel : public GemmKernel, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTile2DPartitioner; + using TilePartitioner = 
ck_tile::GemmTile1DPartitioner; using CodegenGemmTraits = ck_tile::TileGemmTraits; diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 647b54cb8e..dc685567eb 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -55,7 +55,9 @@ class TestCkTileGemmPipeline : public ::testing::Test // TODO: For now - but this should also be a test parameter constexpr bool TransposeC = false; - constexpr int kBlockPerCu = 1; + constexpr int kBlockPerCu = 1; + constexpr ck_tile::index_t TileParitionerGroupNum = 8; + constexpr ck_tile::index_t TileParitionerM01 = 4; // =============================================== @@ -63,7 +65,8 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::TileGemmShape, ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTile2DPartitioner; + using TilePartitioner = ck_tile:: + GemmSpatiallyLocalTilePartitioner; using Traits = ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile:: From 7cf8931677f792cbdf9f3d6516d9c8e2f3a229b2 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:01:43 -0800 Subject: [PATCH 05/18] Enable ck_tile gemms build in CI by default. 
(#1850) * turn on the ck_tile gemm tests by default * enable ck_tile gemms CI build by default --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b212d2d0ab..835b7e724f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -795,7 +795,7 @@ pipeline { description: "Run the ck_tile FMHA tests (default: OFF)") booleanParam( name: "RUN_CK_TILE_GEMM_TESTS", - defaultValue: false, + defaultValue: true, description: "Run the ck_tile GEMM tests (default: ON)") booleanParam( name: "BUILD_INSTANCES_ONLY", From 2ab8bf4c12ba99854afc406ad24626080ee1acd1 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 31 Jan 2025 09:42:43 -0800 Subject: [PATCH 06/18] fix ck_tile gemm scripts (#1851) --- example/ck_tile/03_gemm/script/benchmark_basic.sh | 4 ++-- example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/example/ck_tile/03_gemm/script/benchmark_basic.sh b/example/ck_tile/03_gemm/script/benchmark_basic.sh index f5473e46f4..6c6049ef8b 100755 --- a/example/ck_tile/03_gemm/script/benchmark_basic.sh +++ b/example/ck_tile/03_gemm/script/benchmark_basic.sh @@ -1,12 +1,12 @@ #!/bin/sh EXE="$(find . 
-name tile_example_gemm_basic -type f | head -n 1)" -VALID=0 +VALID=1 for b_matrix_layout in "R" "C"; do for m in "64" "512" "1024" "2048"; do for n in "512" "1024" "2048"; do for k in "64" "512" "1024" "2048"; do - $EXE -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID + $EXE -prec=fp16 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID done done done diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh index a3029cbeb5..8ff7d7ad44 100755 --- a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh +++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh @@ -1,12 +1,12 @@ #!/bin/sh EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" -VALID=0 +VALID=1 for b_matrix_layout in "R" "C"; do for m in "64" "512" "1024" "2048"; do for n in "512" "1024" "2048"; do for k in "64" "512" "1024" "2048"; do - $EXE -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID + $EXE -prec=fp16 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID done done done From 2e3183af4f2c8f15650eacb6a42eac6df1340141 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Fri, 31 Jan 2025 09:48:39 -0800 Subject: [PATCH 07/18] Codegen hipRTC compilation (#1579) * updating codegen build for MIOpen access: adding .cmake for codegen component * updating CMake * adding in header guards for some headers due to issues with hiprtc compilation in MIOpen * some more header guards * putting env file in header guard * cleaning up some includes * updated types file for hiprtc purposes * fixed types file: bit-wise/memcpy issue * updating multiple utility files to deal with standard header inclusion for hiprtc * added some more header guards in the utility files, replacing some standard header functionality * added some 
more header guards * fixing some conflicts in utility files, another round of header guards * fixing errors in data type file * resolved conflict errors in a few utility files * added header guards/replicated functionality in device files * resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd * resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp * added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp * fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl * replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector * resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit * added replicated functionality for standard header methods in utility files * replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp * temp fix for namespace error in MIOpen * remove standard header usage in codegen device op * removed standard header usage in elementwise files, resolved namespace errors * formatting fix * changed codegen argument to ON for testing * temporarily removing codegen compiler flag for testing purposes * added codegen flag again, set default to ON * set codegen flag default back to OFF * replaced enable_if_t standard header usage in data_type.hpp * added some debug prints to pinpoint issues in MIOpen * added print outs to debug in MIOpen * removed debug print outs from device op * resolved stdexcept include error * formatting fix * 
adding includes to new fp8 file to resolve ck::enable_if_t errors * made changes to amd_wave_read_first_lane * updated functionality in type utility file * fixed end of file issue * resovled errors in type utility file, added functionality to array utility file * fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x * formatting fix * replaced standard header usage in amd_ck_fp8 file * added include to random_gen file * removed and replicated standard header usage from data_type and type_convert files for fp8 changes * replicated standard unsigned integer types in random_gen * resolved comments from review: put calls to reinterpret_cast for size_t in header guards * updated/added copyright headers * removed duplicate header * fixed typo in header guard * updated copyright headers --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- CMakeLists.txt | 2 +- codegen/driver/main.cpp | 2 + codegen/src/headers.cpp | 3 + codegen/src/types.cpp | 3 + codegen/test/gemm_multiple_d.cpp | 3 + .../test/grouped_conv_fwd_multiple_d_v1.cpp | 3 + .../test/grouped_conv_fwd_multiple_d_v2.cpp | 3 + .../test/grouped_conv_fwd_multiple_d_v3.cpp | 3 + .../test/grouped_conv_fwd_multiple_d_v4.cpp | 3 + codegen/test/include/common.hpp | 3 + .../test/rtc/include/rtc/compile_kernel.hpp | 3 + codegen/test/rtc/include/rtc/hip.hpp | 5 +- codegen/test/rtc/include/rtc/kernel.hpp | 3 + codegen/test/rtc/include/rtc/manage_ptr.hpp | 3 + codegen/test/rtc/include/rtc/tmp_dir.hpp | 3 + codegen/test/rtc/src/compile_kernel.cpp | 3 + codegen/test/rtc/src/hip.cpp | 3 + codegen/test/rtc/src/kernel.cpp | 4 + codegen/test/rtc/src/tmp_dir.cpp | 3 + include/ck/ck.hpp | 4 +- ...hread_group_tensor_slice_transfer_v7r2.hpp | 4 +- .../convolution_forward_specialization.hpp | 6 +- .../gpu/device/device_base.hpp | 13 +- .../device_grouped_conv_fwd_multiple_abd.hpp | 22 +- .../gpu/device/gemm_specialization.hpp | 4 +- 
...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 57 +-- .../impl/device_column_to_image_impl.hpp | 1 + .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 4 +- ...m_xdl_splitk_c_shuffle_lds_direct_load.hpp | 4 +- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 7 +- ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 1 + .../impl/device_image_to_column_impl.hpp | 1 + .../gpu/device/tensor_layout.hpp | 2 + .../element/binary_element_wise_operation.hpp | 6 +- .../gpu/element/element_wise_operation.hpp | 4 +- .../element/unary_element_wise_operation.hpp | 131 +++---- .../gpu/grid/block_to_ctile_map.hpp | 10 +- ...ridwise_gemm_multiple_abd_xdl_cshuffle.hpp | 53 ++- .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 14 +- .../grid/gridwise_gemm_pipeline_selector.hpp | 9 +- .../threadwise_tensor_slice_transfer.hpp | 4 +- .../transform_conv_fwd_to_gemm.hpp | 137 ++++--- include/ck/utility/amd_buffer_addressing.hpp | 16 +- include/ck/utility/amd_ck_fp8.hpp | 20 +- .../ck/utility/amd_wave_read_first_lane.hpp | 27 +- include/ck/utility/array.hpp | 6 +- include/ck/utility/container_helper.hpp | 6 +- include/ck/utility/data_type.hpp | 275 +++++++++++-- include/ck/utility/debug.hpp | 3 +- include/ck/utility/enable_if.hpp | 19 +- include/ck/utility/env.hpp | 4 +- include/ck/utility/functional.hpp | 6 +- include/ck/utility/functional4.hpp | 12 +- include/ck/utility/integral_constant.hpp | 7 +- include/ck/utility/is_detected.hpp | 16 +- include/ck/utility/loop_scheduler.hpp | 7 +- include/ck/utility/magic_division.hpp | 6 +- include/ck/utility/math_v2.hpp | 6 +- include/ck/utility/random_gen.hpp | 26 +- include/ck/utility/sequence.hpp | 6 +- .../statically_indexed_array_multi_index.hpp | 41 +- include/ck/utility/tuple.hpp | 16 +- include/ck/utility/tuple_helper.hpp | 14 +- include/ck/utility/type.hpp | 365 +++++++++++++++--- include/ck/utility/type_convert.hpp | 44 ++- 65 files changed, 1119 insertions(+), 385 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
86ad9d39d8..20365a6130 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ if(DPP_KERNELS) endif() option(CK_USE_CODEGEN "Enable codegen library" OFF) if(CK_USE_CODEGEN) - add_definitions(-DCK_USE_CODEGEN) + add_definitions(-DCK_USE_CODEGEN) endif() option(CK_TIME_KERNEL "Enable kernel time tracking" ON) diff --git a/codegen/driver/main.cpp b/codegen/driver/main.cpp index c7d295de94..7b878d0d57 100644 --- a/codegen/driver/main.cpp +++ b/codegen/driver/main.cpp @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/codegen/src/headers.cpp b/codegen/src/headers.cpp index 5b0c929db3..452cd99846 100644 --- a/codegen/src/headers.cpp +++ b/codegen/src/headers.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/headers.hpp" #include "ck_headers.hpp" diff --git a/codegen/src/types.cpp b/codegen/src/types.cpp index a8a8b10c04..9aa5d39fae 100644 --- a/codegen/src/types.cpp +++ b/codegen/src/types.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/types.hpp" #include "ck/host/stringutils.hpp" #include diff --git a/codegen/test/gemm_multiple_d.cpp b/codegen/test/gemm_multiple_d.cpp index bd7ef463fb..9e2d990d9b 100644 --- a/codegen/test/gemm_multiple_d.cpp +++ b/codegen/test/gemm_multiple_d.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #include "ck/host/device_gemm_multiple_d/problem.hpp" #include "ck/host/device_gemm_multiple_d/operation.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp index 50290fa25a..9902caab04 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp index b558d97c78..205283e7aa 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp index e2972a93d2..2b83af2432 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp index b728096c51..fbe27e9c8b 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/include/common.hpp b/codegen/test/include/common.hpp index 99d4c64973..24fde2e523 100644 --- a/codegen/test/include/common.hpp +++ b/codegen/test/include/common.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #pragma once #include #include diff --git a/codegen/test/rtc/include/rtc/compile_kernel.hpp b/codegen/test/rtc/include/rtc/compile_kernel.hpp index c4413b47be..a49714f7c6 100644 --- a/codegen/test/rtc/include/rtc/compile_kernel.hpp +++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL diff --git a/codegen/test/rtc/include/rtc/hip.hpp b/codegen/test/rtc/include/rtc/hip.hpp index e962d4cd3e..af2f4a9122 100644 --- a/codegen/test/rtc/include/rtc/hip.hpp +++ b/codegen/test/rtc/include/rtc/hip.hpp @@ -1,10 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP #include #include -#include #include +#include namespace rtc { diff --git a/codegen/test/rtc/include/rtc/kernel.hpp b/codegen/test/rtc/include/rtc/kernel.hpp index 9f38e90416..b1ee729f77 100644 --- a/codegen/test/rtc/include/rtc/kernel.hpp +++ b/codegen/test/rtc/include/rtc/kernel.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL diff --git a/codegen/test/rtc/include/rtc/manage_ptr.hpp b/codegen/test/rtc/include/rtc/manage_ptr.hpp index 92edf12628..52b94d4b70 100644 --- a/codegen/test/rtc/include/rtc/manage_ptr.hpp +++ b/codegen/test/rtc/include/rtc/manage_ptr.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER diff --git a/codegen/test/rtc/include/rtc/tmp_dir.hpp b/codegen/test/rtc/include/rtc/tmp_dir.hpp index a0a2cb9b77..2f3b26cc43 100644 --- a/codegen/test/rtc/include/rtc/tmp_dir.hpp +++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR diff --git a/codegen/test/rtc/src/compile_kernel.cpp b/codegen/test/rtc/src/compile_kernel.cpp index 8cb71b9043..5a70f898e8 100644 --- a/codegen/test/rtc/src/compile_kernel.cpp +++ b/codegen/test/rtc/src/compile_kernel.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #include #include #include diff --git a/codegen/test/rtc/src/hip.cpp b/codegen/test/rtc/src/hip.cpp index 747f83e3ba..6f16e36720 100644 --- a/codegen/test/rtc/src/hip.cpp +++ b/codegen/test/rtc/src/hip.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include diff --git a/codegen/test/rtc/src/kernel.cpp b/codegen/test/rtc/src/kernel.cpp index 9fe38e84ad..982e95de17 100644 --- a/codegen/test/rtc/src/kernel.cpp +++ b/codegen/test/rtc/src/kernel.cpp @@ -1,6 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include +#include #include // extern declare the function since hip/hip_ext.h header is broken diff --git a/codegen/test/rtc/src/tmp_dir.cpp b/codegen/test/rtc/src/tmp_dir.cpp index 4e89bc3539..b36b17cce1 100644 --- a/codegen/test/rtc/src/tmp_dir.cpp +++ b/codegen/test/rtc/src/tmp_dir.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #include #include #include diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index fc9d074716..fa49f6ddd5 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -5,7 +5,7 @@ #include "ck/config.h" #include "ck/utility/env.hpp" - +#ifndef CK_CODE_GEN_RTC #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" @@ -14,7 +14,7 @@ // environment variable to enable logging: // export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) - +#endif // to do: add various levels of logging with CK_LOG_LEVEL #ifndef CK_TIME_KERNEL diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp index 1c4de5ed31..0a0bcbac38 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -131,7 +131,7 @@ struct ThreadGroupTensorSliceTransfer_v7r2 } template - using is_tuple = decltype(std::declval().IsTuple()); + using is_tuple = decltype(ck::declval().IsTuple()); template __device__ void RunWrite(const DstDescs& dst_descs, diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp index 0eef827a5b..cf20025d46 100644 --- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp +++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp @@ -1,9 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#ifndef CK_CODE_GEN_RTC #include +#endif namespace ck { namespace tensor_operation { @@ -18,6 +20,7 @@ enum struct ConvolutionForwardSpecialization Filter3x3, }; +#ifndef CK_CODE_GEN_RTC inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s) { switch(s) @@ -30,6 +33,7 @@ inline std::string getConvForwardSpecializationString(const ConvolutionForwardSp default: return "Unrecognized specialization!"; } } +#endif } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index 736e241fdf..774982d905 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -1,19 +1,21 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once +#ifndef CK_CODE_GEN_RTC #include #include #include #include - #include "ck/stream_config.hpp" +#endif namespace ck { namespace tensor_operation { namespace device { +#ifndef CK_CODE_GEN_RTC #define GET_OBJECT_NAME_IMLP \ std::optional GetObjectName() const override \ { \ @@ -41,7 +43,9 @@ namespace device { } #define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL +#endif +#ifndef CK_CODE_GEN_RTC struct BaseArgument { BaseArgument() = default; @@ -66,13 +70,14 @@ struct BaseInvoker virtual ~BaseInvoker() {} }; +#endif struct BaseOperator { BaseOperator() = default; BaseOperator(const BaseOperator&) = default; BaseOperator& operator=(const BaseOperator&) = default; - +#ifndef CK_CODE_GEN_RTC virtual bool IsSupportedArgument(const BaseArgument*) { return false; } virtual std::string GetTypeString() const { return ""; } @@ -100,7 +105,7 @@ struct BaseOperator assert(p_arg); p_arg->p_workspace_ = p_workspace; } - +#endif virtual ~BaseOperator() {} }; diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp index 184efbbd68..8c9b768a8b 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp @@ -1,9 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once +#ifndef CK_CODE_GEN_RTC #include +#endif #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" @@ -13,8 +15,13 @@ namespace ck { namespace tensor_operation { namespace device { +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif /** * \brief Grouped Convolution Forward @@ -72,12 +79,18 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator static constexpr index_t NumDTensor = DsDataType::Size(); static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor"); - +#ifdef CK_CODE_GEN_RTC + using APointers = ck::conditional_t&, const void*>; + using BPointers = ck::conditional_t&, const void*>; +#else // If DataType is tuple, user has to pass std::array with pointers. using APointers = - std::conditional_t&, const void*>; + ck::conditional_t&, const void*>; using BPointers = - std::conditional_t&, const void*>; + ck::conditional_t&, const void*>; +#endif + +#ifndef CK_CODE_GEN_RTC /** * \brief Make argument pointer for grouped conv fwd. @@ -150,6 +163,7 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator const CDEElementwiseOperation& cde_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; +#endif }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp index 0bb45b18c3..997dcb75a6 100644 --- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp +++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -29,6 +29,7 @@ enum struct GemmSpecialization MNKOPadding, }; +#ifndef CK_CODE_GEN_RTC inline std::string getGemmSpecializationString(const GemmSpecialization& s) { switch(s) @@ -52,6 +53,7 @@ inline std::string getGemmSpecializationString(const GemmSpecialization& s) default: return "Unrecognized specialization!"; } } +#endif } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 180e32c8b6..d9c4e22049 100644 --- a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -3,11 +3,17 @@ #pragma once +#ifndef CK_CODE_GEN_RTC #include #include #include #include #include +#include + +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#endif #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" @@ -15,15 +21,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" -#include 
"ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/io.hpp" namespace ck { namespace tensor_operation { @@ -259,8 +262,13 @@ __global__ void } // namespace +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif // // @brief Device Convolution operation. @@ -429,8 +437,8 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // If we are using multiAB and one of the template datatype parameters is not a tuple, convert // it to it - using GemmADataType = std::conditional_t, ADataType>; - using GemmBDataType = std::conditional_t, BDataType>; + using GemmADataType = ck::conditional_t, ADataType>; + using GemmBDataType = ck::conditional_t, BDataType>; #define GridwiseGemmTemplateParameters \ GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ @@ -449,15 +457,13 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle CDEBlockTransferScalarPerVector_NPerBlock, LoopSched // Use appropriate gridwise gemm using GridwiseGemm = - std::conditional_t, - GridwiseGemmMultipleD_xdl_cshuffle>; + ck::conditional_t, + GridwiseGemmMultipleD_xdl_cshuffle>; // If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers. - using APointers = - std::conditional_t&, const void*>; - using BPointers = - std::conditional_t&, const void*>; + using APointers = ck::conditional_t&, const void*>; + using BPointers = ck::conditional_t&, const void*>; // Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not // in initializer list what is required for single const pointer). 
using AGridPointer = remove_cvref_t< @@ -812,7 +818,6 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle static_for<0, NumDTensor, 1>{}([&](auto i) { using DLayout = remove_cvref_t>; - // FIXME: layout if constexpr(is_same_v || is_same_v || is_same_v || is_same_v || @@ -965,18 +970,18 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle const BElementwiseOperation& b_element_op, const CDEElementwiseOperation& cde_element_op) { - std::array a_g_n_c_wis_lengths_i32; - std::array a_g_n_c_wis_strides_i32; - std::array b_g_k_c_xs_lengths_i32; - std::array b_g_k_c_xs_strides_i32; - std::array, NumDTensor> ds_g_n_k_wos_lengths_i32; - std::array, NumDTensor> ds_g_n_k_wos_strides_i32; - std::array e_g_n_k_wos_lengths_i32; - std::array e_g_n_k_wos_strides_i32; - std::array conv_filter_strides_i32; - std::array conv_filter_dilations_i32; - std::array input_left_pads_i32; - std::array input_right_pads_i32; + ck::Array a_g_n_c_wis_lengths_i32; + ck::Array a_g_n_c_wis_strides_i32; + ck::Array b_g_k_c_xs_lengths_i32; + ck::Array b_g_k_c_xs_strides_i32; + ck::Array, NumDTensor> ds_g_n_k_wos_lengths_i32; + ck::Array, NumDTensor> ds_g_n_k_wos_strides_i32; + ck::Array e_g_n_k_wos_lengths_i32; + ck::Array e_g_n_k_wos_strides_i32; + ck::Array conv_filter_strides_i32; + ck::Array conv_filter_dilations_i32; + ck::Array input_left_pads_i32; + ck::Array input_right_pads_i32; array_convert(a_g_n_c_wis_lengths_i32, a_g_n_c_wis_lengths); array_convert(a_g_n_c_wis_strides_i32, a_g_n_c_wis_strides); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp index e4203e0313..9482812f75 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/library/utility/numeric.hpp" #include 
"ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp index 7f28ec7680..2666051c86 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp @@ -205,8 +205,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK #include +#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -212,9 +213,13 @@ __global__ void } } // namespace - +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif // // @brief Device Convolution operation. 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index 589a0daa99..85d1ba8f48 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -9,6 +9,7 @@ #include #include +#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp index 648736fcbf..1ad37058db 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/library/utility/numeric.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp index 2202bc5695..85adb64b43 100644 --- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout } // namespace convolution +#ifndef CK_CODE_GEN_RTC template < typename Layout, typename std::enable_if::value, bool>::type = false> @@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&) os << Layout::name; return os; } +#endif } // namespace tensor_layout } // namespace ck diff --git 
a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index c87c90a91d..530876650e 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -340,8 +340,8 @@ struct Bilinear }; template <> - __host__ __device__ constexpr void operator()( - std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x0, const int8_t& x1) const { y = type_convert(alpha_ * type_convert(x0) + beta_ * type_convert(x1)); diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index b914c0b96f..370d03258d 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -533,7 +533,7 @@ struct NormalizeInInfer const T3& gamma, const T4& beta) const { - static_assert(std::is_same::value || std::is_same::value, + static_assert(is_same::value || is_same::value, "Data type is not supported by this operation!"); using ck::type_convert; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 5e522fb2ea..139f0057e4 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -252,7 +252,7 @@ struct PassThroughPack2 template __host__ __device__ void operator()(Y& y, const X& x) const; - __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const + __host__ __device__ constexpr void operator()(half2_t& y, const f8x2_t& x) const { auto t = type_convert(x); y = type_convert(t); @@ -479,7 +479,7 @@ struct PassThrough template <> __host__ __device__ void operator()(bf8_t& y, const half_t& x) const { - y = ck::type_convert(x); + y = type_convert(x); } }; @@ -552,21 +552,21 @@ struct Scale template __host__ __device__ void operator()(Y& y, const X& x) const { - y = ck::type_convert(ck::type_convert(x) * scale_); + y = type_convert(type_convert(x) * scale_); } template <> __host__ __device__ void operator()(half_t& y, const half_t& x) const { - y = ck::type_convert(scale_) * x; + y = type_convert(scale_) * x; }; template <> __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { - const float x_tmp = ck::type_convert(x); + const float x_tmp = type_convert(x); const float y_tmp = scale_ * x_tmp; - y = ck::type_convert(y_tmp); + y = type_convert(y_tmp); }; template <> @@ -584,7 +584,7 @@ struct Scale template <> __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { - y = ck::type_convert(scale_ * ck::type_convert(x)); + y = type_convert(scale_ * 
type_convert(x)); }; float scale_; @@ -600,7 +600,7 @@ struct ScaleAndResetNaNToMinusInfinity template <> __host__ __device__ void operator()(float& y, const float& x) const { - y = ck::math::isnan(x) ? -ck::NumericLimits::Infinity() : scale_ * x; + y = math::isnan(x) ? -NumericLimits::Infinity() : scale_ * x; }; float scale_; @@ -671,12 +671,13 @@ struct UnaryAbs template __host__ __device__ void operator()(T& y, const T& x) const { + static_assert(is_same::value || is_same::value || is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::abs(x); + y = math::abs(x); }; template <> @@ -694,7 +695,7 @@ struct UnarySqrt static_assert(is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sqrt(x); + y = math::sqrt(x); }; }; @@ -713,9 +714,9 @@ struct Relu template <> __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { - float x_f32 = ck::type_convert(x); + float x_f32 = type_convert(x); float y_f32 = x_f32 > 0 ? 
x_f32 : 0; - y = ck::type_convert(y_f32); + y = type_convert(y_f32); } }; @@ -731,7 +732,7 @@ struct FastGelu template __device__ void operator()(Y& y, const X& x) const; - +#ifndef CK_CODE_GEN_RTC template <> __host__ void operator()(float& y, const float& x) const { @@ -742,6 +743,7 @@ struct FastGelu const float emu = exp(u); y = x / (1.f + emu); } +#endif // device code, use lower precision "__ocml_exp_f32" and "rcp" template <> @@ -753,7 +755,7 @@ struct FastGelu const float u = x * (c1 * x * x + c2); const float emu = __ocml_exp_f32(u); - y = x * ck::math::rcp(1.f + emu); + y = x * math::rcp(1.f + emu); } template <> @@ -851,10 +853,9 @@ struct Gelu } template <> - __host__ __device__ void operator()(ck::half_t& y, - const ck::half_t& x) const + __host__ __device__ void operator()(half_t& y, const half_t& x) const { - y = ck::half_t(0.5) * x * (ck::half_t(1) + ck::half_t(erf(float(0.70710678118f * x)))); + y = half_t(0.5) * x * (half_t(1) + half_t(erf(float(0.70710678118f * x)))); } }; @@ -868,7 +869,7 @@ struct Sigmoid is_same::value, "Data type is not supported by this operation!"); constexpr T one = type_convert(1); - y = one / (one + ck::math::exp(-x)); + y = one / (one + math::exp(-x)); }; }; @@ -877,11 +878,11 @@ struct Silu template __host__ __device__ void operator()(T& y, const T& x) const { - static_assert(is_same_v || is_same_v || is_same_v || + static_assert(is_same_v || is_same_v || is_same_v || is_same_v || is_same_v, "Data type is not supported by this operation!"); constexpr T one = type_convert(1); - y = x * (one / (one + ck::math::exp(-x))); + y = x * (one / (one + math::exp(-x))); }; }; @@ -895,7 +896,7 @@ struct TanH is_same::value, "Data type is not supported by this operation!"); - y = ck::math::tanh(x); + y = math::tanh(x); }; }; @@ -905,11 +906,11 @@ struct ACos __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || 
is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::acos(x); + y = math::acos(x); }; }; @@ -919,11 +920,11 @@ struct Neg __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::neg(x); + y = math::neg(x); }; }; @@ -933,11 +934,11 @@ struct ATan __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::atan(x); + y = math::atan(x); }; }; @@ -947,11 +948,11 @@ struct Sin __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sin(x); + y = math::sin(x); }; }; @@ -961,11 +962,11 @@ struct ASinH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::asinh(x); + y = math::asinh(x); }; }; @@ -975,11 +976,11 @@ struct Cos __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::cos(x); + y = cos(x); }; }; @@ -989,11 +990,11 @@ struct ACosH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || 
is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::acosh(x); + y = math::acosh(x); }; }; @@ -1003,11 +1004,11 @@ struct Tan __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::tan(x); + y = math::tan(x); }; }; @@ -1017,11 +1018,11 @@ struct ATanH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::atanh(x); + y = math::atanh(x); }; }; @@ -1031,11 +1032,11 @@ struct SinH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sinh(x); + y = math::sinh(x); }; }; @@ -1045,11 +1046,11 @@ struct Ceil __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::ceil(x); + y = math::ceil(x); }; }; @@ -1059,11 +1060,11 @@ struct Exp __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::exp(x); + y = math::exp(x); }; }; @@ -1073,11 +1074,11 @@ struct CosH __host__ __device__ void operator()(T& y, const T& x) const { 
static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::cosh(x); + y = math::cosh(x); }; }; @@ -1087,11 +1088,11 @@ struct Floor __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::floor(x); + y = math::floor(x); }; }; @@ -1101,11 +1102,11 @@ struct Log __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::log(x); + y = math::log(x); }; }; @@ -1115,11 +1116,11 @@ struct ASin __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::asin(x); + y = math::asin(x); }; }; @@ -1129,11 +1130,11 @@ struct Rcp __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::rcp(x); + y = math::rcp(x); }; }; @@ -1153,7 +1154,7 @@ struct Swish "Data type is not supported by this operation!"); float bx = -beta_ * type_convert(x); - y = type_convert(x / (1.f + ck::math::exp(bx))); + y = type_convert(x / (1.f + math::exp(bx))); }; const float beta_; @@ -1172,7 +1173,7 @@ struct SoftRelu "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); constexpr T one = 
type_convert(1); - y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + y = math::log(one + math::exp(x * casted_alpha)) / casted_alpha; } const float alpha_; }; @@ -1193,7 +1194,7 @@ struct Power T casted_beta = type_convert(beta_); T casted_gamma = type_convert(gamma_); T shifted_scaled_x = casted_alpha + casted_beta * x; - y = ck::math::pow(shifted_scaled_x, casted_gamma); + y = math::pow(shifted_scaled_x, casted_gamma); } const float alpha_; const float beta_; @@ -1213,7 +1214,7 @@ struct ClippedRelu "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); T casted_beta = type_convert(beta_); - y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + y = math::min(casted_beta, math::max(casted_alpha, x)); } const float alpha_; const float beta_; @@ -1248,7 +1249,7 @@ struct Elu is_same::value, "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); - y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + y = x > 0 ? 
x : casted_alpha * math::expm1(x); } const float alpha_; }; @@ -1350,10 +1351,10 @@ struct FastNumericArrayConverter }; template <> -struct FastNumericArrayConverter +struct FastNumericArrayConverter { using InputArray = vector_type; - using OutputArray = vector_type; + using OutputArray = vector_type; __device__ static OutputArray convert(InputArray const& Input) { @@ -1383,13 +1384,13 @@ struct FastNumericArrayConverter }; template -struct FastNumericArrayConverter +struct FastNumericArrayConverter { static constexpr int VEC_WIDTH = 4; static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); using InputArray = vector_type; - using OutputArray = vector_type; + using OutputArray = vector_type; __device__ static OutputArray convert(InputArray const& Input) { @@ -1398,7 +1399,7 @@ struct FastNumericArrayConverter OutputArray Output; using Vec_InputArray = vector_type; - using Vec_OutputArray = vector_type; + using Vec_OutputArray = vector_type; Vec_OutputArray* half_4_ptr = reinterpret_cast(&Output); Vec_InputArray const* uint8_4_ptr = reinterpret_cast(&Input); diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp index 56c37b1b72..2bc9ef87ac 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -1,14 +1,17 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include "ck/utility/math.hpp" #include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" #include "ck/tensor_description/tensor_adaptor.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" +#ifndef CK_CODE_GEN_RTC #include #include +#endif namespace ck { @@ -978,8 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit // Create 3D grid const auto M0 = math::integer_divide_ceil(M, MPerBlock); const auto N0 = math::integer_divide_ceil(N, NPerBlock); - - return std::make_tuple(N0, M0, k_split); + return make_tuple(N0, M0, k_split); } template @@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK uint32_t dp_for_sk_iters = k_iters_per_tile.get(); uint32_t best_sk_score = - std::numeric_limits::max(); // we need to find the smallest sk iters + NumericLimits::Max(); // we need to find the smallest sk iters for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles; tentative_sk_blocks++) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp index 150dd98064..344656b13f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -423,10 +423,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeAsGridDescriptor_M_K(const std::array& MRaws, - const std::array& KRaws, - const std::array& AsStride) + __host__ __device__ static auto MakeAsGridDescriptor_M_K( +#ifdef CK_CODE_GEN_RTC + const ck::Array& MRaws, + const ck::Array& KRaws, + const ck::Array& AsStride +#else + const std::array& MRaws, + const std::array& KRaws, + const std::array& AsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -462,10 +469,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeBsGridDescriptor_N_K(const std::array& NRaws, - const std::array& KRaws, - const std::array& BsStride) + __host__ __device__ static auto MakeBsGridDescriptor_N_K( +#ifdef CK_CODE_GEN_RTC + const ck::Array& NRaws, + const ck::Array& KRaws, + const ck::Array& BsStride +#else + const std::array& NRaws, + const std::array& KRaws, + const std::array& BsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -500,10 +514,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeDsGridDescriptor_M_N(const std::array& MRaws, - const std::array& NRaws, - const std::array& DsStride) + __host__ __device__ static auto MakeDsGridDescriptor_M_N( +#ifdef CK_CODE_GEN_RTC + const ck::Array& MRaws, + const ck::Array& NRaws, + const ck::Array& DsStride +#else + const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -969,9 +990,15 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle const index_t M, const index_t N, const index_t K, +#ifdef CK_CODE_GEN_RTC + const ck::Array StrideAs, + const ck::Array StrideBs, + const ck::Array StrideDs, +#else const std::array StrideAs, const std::array StrideBs, const std::array StrideDs, +#endif const index_t StrideE, const Block2ETileMap& block_2_etile_map) { diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp index 4b344c02f8..eb1eb533d7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -473,11 +473,19 @@ struct GridwiseGemmMultipleD_xdl_cshuffle return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); } +#ifdef CK_CODE_GEN_RTC + template + __host__ __device__ static auto + MakeDsGridDescriptor_M_N(const ck::Array& MRaws, + const ck::Array& NRaws, + const ck::Array& DsStride) +#else template __host__ __device__ static auto MakeDsGridDescriptor_M_N(const std::array& MRaws, const std::array& NRaws, const std::array& DsStride) +#endif { return generate_tuple( [&](auto i) { @@ -941,7 +949,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle const index_t K, const index_t StrideA, const index_t StrideB, +#ifdef CK_CODE_GEN_RTC + const ck::Array StrideDs, +#else const std::array StrideDs, +#endif const index_t StrideE, const Block2ETileMap& block_2_etile_map) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 44cbbcd049..9dad66913a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -1,10 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once - +#ifndef CK_CODE_GEN_RTC #include #include +#endif #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" @@ -53,12 +54,15 @@ constexpr auto GridwiseGemmPipeline_Selector() } else { +#ifndef CK_CODE_GEN_RTC std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl; +#endif } } } // namespace ck +#ifndef CK_CODE_GEN_RTC inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) { switch(p) @@ -71,3 +75,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) } return os; } +#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index bb1871ae62..21315c2567 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -212,7 +212,7 @@ template ::type = false> struct ThreadwiseTensorSliceTransfer_v2 { - static_assert((InvalidElementAsNaN && !std::is_integral::value) || + static_assert((InvalidElementAsNaN && !ck::is_integral::value) || (!InvalidElementAsNaN), "Filling invalid element as NaN is only for floating point types"); diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index b91b12ad52..3db94deccb 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -1,10 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -148,8 +147,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -201,11 +200,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I0]}, ZYX_{X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -219,8 +222,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const 
ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -272,11 +275,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I1]}, ZYX_{Y_ * X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -290,8 +297,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -343,11 +350,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I2]}, ZYX_{Z_ * Y_ * X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -478,11 +489,11 @@ struct TransformConvFwdToGemm // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as // properties template || - is_same_v || - is_same_v), - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeADescriptor_M_K() const { if constexpr(ConvForwardSpecialization == @@ -691,11 +702,11 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v), - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeADescriptor_M_K() const { @@ -932,7 +943,7 @@ struct TransformConvFwdToGemm } template || is_same_v || is_same_v), @@ -1242,19 +1253,19 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v, - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v, + bool>::type = false> __host__ 
__device__ auto MakeBDescriptor_N_K() const { if constexpr(ConvForwardSpecialization == device::ConvolutionForwardSpecialization::Filter3x3) { using FilterSizeNumType = - std::conditional_t, - std::conditional_t, Number<27>>>; + ck::conditional_t, + ck::conditional_t, Number<27>>>; if constexpr(NumGroupsToMerge == 1) { @@ -1297,13 +1308,13 @@ struct TransformConvFwdToGemm template < typename BLayout, - typename std::enable_if || - is_same_v || - is_same_v || - is_same_v || - is_same_v || - is_same_v, - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v, + bool>::type = false> __host__ __device__ auto MakeBDescriptor_N_K() const { const auto wei_k_yx_c_desc = make_naive_tensor_descriptor( @@ -1318,36 +1329,36 @@ struct TransformConvFwdToGemm return wei_gemmn_gemmk_desc; } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_), make_tuple(I0, KStrideTensorC_)); } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), make_tuple(I0, KStrideTensorC_)); } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_), @@ -1355,12 +1366,12 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v), - bool>::type = false> + index_t NDimSp = NDimSpatial, + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { const IndexType NDoHoWo = N_ * Wo_; @@ -1410,11 +1421,11 @@ struct TransformConvFwdToGemm template || - is_same_v || - is_same_v), - bool>::type = false> + 
typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { const IndexType NDoHoWo = N_ * Ho_ * Wo_; @@ -1467,7 +1478,7 @@ struct TransformConvFwdToGemm template || is_same_v || is_same_v), diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index ad13c44311..534a01e083 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "data_type.hpp" @@ -1021,15 +1021,24 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; static_assert(bytes_per_thread == dword_bytes); +#ifndef CK_CODE_GEN_RTC const uint32_t* global_ptr = reinterpret_cast(reinterpret_cast(global_base_ptr)); +#else + const uint32_t* global_ptr = + reinterpret_cast(reinterpret_cast(global_base_ptr)); +#endif const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size); const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM T* lds_ptr = lds_base_ptr + lds_offset; +#ifndef CK_CODE_GEN_RTC auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); +#else + auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); +#endif asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), @@ -1038,8 +1047,13 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, #else // LDS pointer must be attributed with the LDS address space. 
__attribute__((address_space(3))) uint32_t* lds_ptr = +#ifndef CK_CODE_GEN_RTC reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( reinterpret_cast(lds_base_ptr + lds_offset)); +#else + reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( + reinterpret_cast(lds_base_ptr + lds_offset)); +#endif llvm_amdgcn_raw_buffer_load_lds( src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index e9174904c9..b4838277f1 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -1,8 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/ck.hpp" +#include "ck/utility/enable_if.hpp" #include "ck/utility/random_gen.hpp" #include "ck/utility/type.hpp" @@ -424,9 +426,9 @@ __host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a) } template || std::is_same_v || - std::is_same_v || std::is_same_v, - bool> = true> + ck::enable_if_t || is_same_v || + is_same_v || is_same_v, + bool> = true> __host__ __device__ static inline constexpr bool fp8_is_inf(T) { return false; @@ -823,7 +825,11 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) if constexpr(stochastic_rounding) { constexpr int seed = 1254739; - rng = prand_generator(reinterpret_cast(&f), f); +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&f), f); +#else + rng = prand_generator(reinterpret_cast(&f), f); +#endif } return cast_to_f8_from_f32( f, rng); @@ -839,7 +845,11 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) if constexpr(stochastic_rounding) { constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC rng = prand_generator(reinterpret_cast(&f), f); +#else + rng = prand_generator(reinterpret_cast(&f), f); +#endif } if constexpr(interp 
== ck_fp8_interpretation_t::CK_E4M3_FNUZ) diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp index d6e1eab314..128c8e9a2c 100644 --- a/include/ck/utility/amd_wave_read_first_lane.hpp +++ b/include/ck/utility/amd_wave_read_first_lane.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,10 +7,12 @@ #include "ck/utility/functional2.hpp" #include "ck/utility/math.hpp" +#ifndef CK_CODE_GEN_RTC #include #include #include #include +#endif namespace ck { namespace detail { @@ -37,7 +39,7 @@ struct get_carrier<3> { using value_type = uint32_t; - std::array bytes; + Array bytes; static_assert(sizeof(bytes) <= sizeof(value_type)); // replacement of host std::copy_n() @@ -61,22 +63,22 @@ struct get_carrier<3> // method to trigger template substitution failure __device__ carrier(const carrier& other) noexcept { - copy_n(other.bytes.begin(), bytes.size(), bytes.begin()); + copy_n(other.bytes.begin(), bytes.Size(), bytes.begin()); } public: __device__ carrier& operator=(value_type value) noexcept { - copy_n(reinterpret_cast(&value), bytes.size(), bytes.begin()); + copy_n(reinterpret_cast(&value), bytes.Size(), bytes.begin()); return *this; } __device__ operator value_type() const noexcept { - std::byte result[sizeof(value_type)]; + ck::byte result[sizeof(value_type)]; - copy_n(bytes.begin(), bytes.size(), result); + copy_n(bytes.begin(), bytes.Size(), result); return *reinterpret_cast(result); } @@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) { constexpr unsigned object_size = sizeof(int64_t); constexpr unsigned second_part_offset = object_size / 2; - auto* const from_obj = reinterpret_cast(&value); - alignas(int64_t) std::byte to_obj[object_size]; + auto* const from_obj = reinterpret_cast(&value); + 
alignas(int64_t) ck::byte to_obj[object_size]; using Sgpr = uint32_t; @@ -122,17 +124,16 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) return *reinterpret_cast(to_obj); } -template < - typename Object, - typename = std::enable_if_t && std::is_trivially_copyable_v>> +template && ck::is_trivially_copyable_v>> __device__ auto amd_wave_read_first_lane(const Object& obj) { using Size = unsigned; constexpr Size SgprSize = 4; constexpr Size ObjectSize = sizeof(Object); - auto* const from_obj = reinterpret_cast(&obj); - alignas(Object) std::byte to_obj[ObjectSize]; + auto* const from_obj = reinterpret_cast(&obj); + alignas(Object) ck::byte to_obj[ObjectSize]; constexpr Size RemainedSize = ObjectSize % SgprSize; constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp index 5366c56a9d..2afad00d49 100644 --- a/include/ck/utility/array.hpp +++ b/include/ck/utility/array.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef CK_ARRAY_HPP #define CK_ARRAY_HPP @@ -38,6 +38,8 @@ struct Array } __host__ __device__ constexpr const TData* begin() const { return &mData[0]; } __host__ __device__ constexpr const TData* end() const { return &mData[NSize]; } + __host__ __device__ constexpr TData* begin() { return &mData[0]; } + __host__ __device__ constexpr TData* end() { return &mData[NSize]; } }; // empty Array @@ -54,7 +56,7 @@ template __host__ __device__ constexpr auto make_array(X&& x, Xs&&... 
xs) { using data_type = remove_cvref_t; - return Array{std::forward(x), std::forward(xs)...}; + return Array{ck::forward(x), ck::forward(xs)...}; } // make empty array diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp index 9c7b954565..bd0ca42ecd 100644 --- a/include/ck/utility/container_helper.hpp +++ b/include/ck/utility/container_helper.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef CK_CONTAINER_HELPER_HPP #define CK_CONTAINER_HELPER_HPP @@ -326,14 +326,14 @@ template __host__ __device__ constexpr auto container_concat(const Array& ax, const Array& ay) { return unpack2( - [&](auto&&... zs) { return make_array(std::forward(zs)...); }, ax, ay); + [&](auto&&... zs) { return make_array(ck::forward(zs)...); }, ax, ay); } template __host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) { return unpack2( - [&](auto&&... zs) { return make_tuple(std::forward(zs)...); }, tx, ty); + [&](auto&&... 
zs) { return make_tuple(ck::forward(zs)...); }, tx, ty); } template diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index d9c954c50f..882d661331 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -5,9 +5,21 @@ #include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/statically_indexed_array.hpp" - +#ifdef CK_CODE_GEN_RTC +using int8_t = signed char; +using uint8_t = unsigned char; +using int16_t = signed short; +using uint16_t = unsigned short; +using float_t = float; +#endif namespace ck { +#ifdef CK_CODE_GEN_RTC +using byte = unsigned char; +#else +using std::byte; +#endif + using bhalf_t = ushort; using half_t = _Float16; using int4_t = _BitInt(4); @@ -217,7 +229,7 @@ struct scalar_type }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using type = d1_t; @@ -253,7 +265,7 @@ struct vector_type()>> __device__ int static err = 0; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -313,7 +325,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -383,7 +395,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -453,7 +465,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d4_t __attribute__((ext_vector_type(4))); @@ -523,7 +535,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -605,7 +617,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -687,7 +699,7 @@ struct vector_type()>> }; template -struct 
vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d4_t __attribute__((ext_vector_type(4))); @@ -769,7 +781,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -863,7 +875,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -967,7 +979,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1083,7 +1095,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1209,7 +1221,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1374,7 +1386,7 @@ template struct non_native_vector_base< T, N, - std::enable_if_t> + ck::enable_if_t> { using data_t = typename nnvb_data_t_selector::type; // select data_t based on the size of T static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch"); @@ -1499,7 +1511,7 @@ struct scalar_type> // non-native vector_type implementation template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1550,7 +1562,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1613,7 +1625,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1686,7 +1698,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1771,7 +1783,7 @@ struct 
vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1866,7 +1878,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d2_t = non_native_vector_base; @@ -1970,7 +1982,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d2_t = non_native_vector_base; @@ -2210,20 +2222,230 @@ using pk_i4x2_t = typename vector_type::type; using pk_i4x4_t = typename vector_type::type; using pk_i4x8_t = typename vector_type::type; +#ifdef CK_CODE_GEN_RTC +template +struct NumericLimits; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; } + + __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; } + + __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; } + + __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr int32_t QuietNaN() { return 0; } +}; +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; } + + __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; } + + __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; } + + __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr int16_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; } + + __host__ __device__ static constexpr int8_t Min() noexcept { return -128; } + + __host__ __device__ static constexpr int8_t Max() noexcept { return 127; } + + __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr 
int8_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; } + + __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; } + + __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + static constexpr unsigned int binary_min = 0x00800000; + static constexpr unsigned int binary_max = 0x7F7FFFFF; + static constexpr unsigned int binary_lowest = 0xFF7FFFFF; + static constexpr unsigned int binary_qnan = 0xFFC00001; + static constexpr unsigned int binary_inf = 0x7F800000; + + __host__ __device__ static constexpr float Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr float Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr float Lowest() { return bit_cast(binary_lowest); } + + __host__ __device__ static constexpr float QuietNaN() { return bit_cast(binary_qnan); } + + __host__ __device__ static constexpr float Infinity() { return bit_cast(binary_inf); } +}; + +template <> +struct NumericLimits +{ + static constexpr unsigned short binary_min = 0x0400; + static constexpr unsigned short binary_max = 0x7BFF; + static constexpr unsigned short binary_lowest = 0xFBFF; + static constexpr unsigned short binary_qnan = 0x7FFF; + + __host__
__device__ static constexpr half_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr half_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr half_t Lowest() { return bit_cast(binary_lowest); } + + __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast(binary_qnan); } +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); } + + __host__ __device__ static constexpr int4_t Max() { return int4_t(7); } + + __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); } +}; +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + +template <> +struct NumericLimits +{ + // negative zero nan mode with exp bias = 8 + static constexpr uint8_t binary_min = 0x08; // 0b00001000 + static constexpr uint8_t binary_max = 0x7F; // 0b01111111 + static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111 + static constexpr uint8_t binary_qnan = 0x80; // 0b10000000 + // ieee mode with exp bias = 7 + // static constexpr uint8_t binary_min = 0x08; // 0b00001000 + // static constexpr uint8_t binary_max = 0x77; // 0b01110111 + // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111 + // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0 + + __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); } + + __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); } + + __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); } + + __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + // negative zero nan mode with exp bias = 16 + static constexpr uint8_t binary_min = 0x04; // 0b00000100 + static constexpr uint8_t binary_max = 0x7F; // 0b01111111 + static constexpr uint8_t binary_lowest = 0xFF; // 
0b11111111 + static constexpr uint8_t binary_qnan = 0x80; // 0b10000000 + // ieee mode with exp bias = 15 + // static constexpr uint8_t binary_min = 0x04; // 0b00000100 + // static constexpr uint8_t binary_max = 0x7B; // 0b01111011 + // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 + // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!= + + __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); } + + __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); } + + __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); } + + __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6 + static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448 + static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448 + static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111 + + __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr f8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr f8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14 + static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344 + static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344 + static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101 + + __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr bf8_ocp_t 
Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr bf8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; +#else template struct NumericLimits { __host__ __device__ static constexpr T Min() { return std::numeric_limits::min(); } - __host__ __device__ static constexpr T Max() { return std::numeric_limits::max(); } - __host__ __device__ static constexpr T Lowest() { return std::numeric_limits::lowest(); } - __host__ __device__ static constexpr T QuietNaN() { return std::numeric_limits::quiet_NaN(); } - __host__ __device__ static constexpr T Infinity() { return std::numeric_limits::infinity(); } }; @@ -2347,6 +2569,7 @@ struct NumericLimits return bit_cast(binary_qnan); } }; +#endif template struct NumericUtils diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp index 03c4e16dd6..2b247cc02a 100644 --- a/include/ck/utility/debug.hpp +++ b/include/ck/utility/debug.hpp @@ -1,8 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef UTILITY_DEBUG_HPP #define UTILITY_DEBUG_HPP +#include "type.hpp" namespace ck { namespace debug { diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp index c0a3c99f1f..6ba63fc761 100644 --- a/include/ck/utility/enable_if.hpp +++ b/include/ck/utility/enable_if.hpp @@ -1,14 +1,31 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once namespace ck { +#ifndef CK_CODE_GEN_RTC template using enable_if = std::enable_if; template using enable_if_t = typename std::enable_if::type; +#else +template +struct enable_if +{ +}; + +template +struct enable_if +{ + using type = T; +}; + +template +using enable_if_t = typename enable_if::type; +#endif + } // namespace ck diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp index 6455402dcb..809f302f74 100644 --- a/include/ck/utility/env.hpp +++ b/include/ck/utility/env.hpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +#ifndef CK_CODE_GEN_RTC #pragma once #include @@ -183,3 +184,4 @@ void UpdateEnvVar(EnvVar, const std::string_view& val) } } // namespace ck +#endif diff --git a/include/ck/utility/functional.hpp b/include/ck/utility/functional.hpp index 91797d2409..cd48ed1747 100644 --- a/include/ck/utility/functional.hpp +++ b/include/ck/utility/functional.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y) { if constexpr(predicate) { - return std::forward(x); + return ck::forward(x); } else { - return std::forward(y); + return ck::forward(y); } } diff --git a/include/ck/utility/functional4.hpp b/include/ck/utility/functional4.hpp index b5f3df8d7c..8e86a296dc 100644 --- a/include/ck/utility/functional4.hpp +++ b/include/ck/utility/functional4.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#ifndef CK_FUNCTIONAL4_HPP #define CK_FUNCTIONAL4_HPP @@ -21,7 +21,7 @@ struct unpack_impl> template __host__ __device__ constexpr auto operator()(F&& f, X&& x) const { - return std::forward(f)(std::forward(x).At(Number{})...); + return ck::forward(f)(ck::forward(x).At(Number{})...); } }; @@ -35,8 +35,8 @@ struct unpack2_impl, Sequence> template __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const { - return std::forward(f)(std::forward(x).At(Number{})..., - std::forward(y).At(Number{})...); + return ck::forward(f)(ck::forward(x).At(Number{})..., + ck::forward(y).At(Number{})...); } }; @@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x) { using X_ = remove_reference_t; return detail::unpack_impl::type>{}( - std::forward(f), std::forward(x)); + ck::forward(f), ck::forward(x)); } // TODO: properly implement unpack that takes any number of containers @@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y) using Y_ = remove_reference_t; return detail::unpack2_impl::type, typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}( - std::forward(f), std::forward(x), std::forward(y)); + ck::forward(f), ck::forward(x), ck::forward(y)); } } // namespace ck diff --git a/include/ck/utility/integral_constant.hpp b/include/ck/utility/integral_constant.hpp index 376070eb3d..75f35d762c 100644 --- a/include/ck/utility/integral_constant.hpp +++ b/include/ck/utility/integral_constant.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -48,4 +48,9 @@ __host__ __device__ constexpr auto operator%(integral_constant, integral_ return integral_constant{}; } +template +using bool_constant = integral_constant; + +using true_type = bool_constant; +using false_type = bool_constant; } // namespace ck diff --git a/include/ck/utility/is_detected.hpp b/include/ck/utility/is_detected.hpp index 7a324a6c45..a700fcfff1 100644 --- a/include/ck/utility/is_detected.hpp +++ b/include/ck/utility/is_detected.hpp @@ -1,22 +1,24 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/utility/integral_constant.hpp" + namespace ck { namespace detail { template class Op, class... Args> struct detector { - using value_t = std::false_type; + using value_t = integral_constant; using type = Default; }; template class Op, class... Args> -struct detector>, Op, Args...> +struct detector>, Op, Args...> { - using value_t = std::true_type; + using value_t = integral_constant; using type = Op; }; } // namespace detail @@ -32,12 +34,12 @@ template