diff --git a/CMakeLists.txt b/CMakeLists.txt index 86ad9d39d8..20365a6130 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ if(DPP_KERNELS) endif() option(CK_USE_CODEGEN "Enable codegen library" OFF) if(CK_USE_CODEGEN) - add_definitions(-DCK_USE_CODEGEN) + add_definitions(-DCK_USE_CODEGEN) endif() option(CK_TIME_KERNEL "Enable kernel time tracking" ON) diff --git a/codegen/driver/main.cpp b/codegen/driver/main.cpp index c7d295de94..7b878d0d57 100644 --- a/codegen/driver/main.cpp +++ b/codegen/driver/main.cpp @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/codegen/src/headers.cpp b/codegen/src/headers.cpp index 5b0c929db3..452cd99846 100644 --- a/codegen/src/headers.cpp +++ b/codegen/src/headers.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/headers.hpp" #include "ck_headers.hpp" diff --git a/codegen/src/types.cpp b/codegen/src/types.cpp index a8a8b10c04..9aa5d39fae 100644 --- a/codegen/src/types.cpp +++ b/codegen/src/types.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/types.hpp" #include "ck/host/stringutils.hpp" #include diff --git a/codegen/test/gemm_multiple_d.cpp b/codegen/test/gemm_multiple_d.cpp index bd7ef463fb..9e2d990d9b 100644 --- a/codegen/test/gemm_multiple_d.cpp +++ b/codegen/test/gemm_multiple_d.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_gemm_multiple_d/problem.hpp" #include "ck/host/device_gemm_multiple_d/operation.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp index 50290fa25a..9902caab04 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp index b558d97c78..205283e7aa 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp index e2972a93d2..2b83af2432 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp index b728096c51..fbe27e9c8b 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" #include "ck/host/headers.hpp" diff --git a/codegen/test/include/common.hpp b/codegen/test/include/common.hpp index 99d4c64973..24fde2e523 100644 --- a/codegen/test/include/common.hpp +++ b/codegen/test/include/common.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #pragma once #include #include diff --git a/codegen/test/rtc/include/rtc/compile_kernel.hpp b/codegen/test/rtc/include/rtc/compile_kernel.hpp index c4413b47be..a49714f7c6 100644 --- a/codegen/test/rtc/include/rtc/compile_kernel.hpp +++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL diff --git a/codegen/test/rtc/include/rtc/hip.hpp b/codegen/test/rtc/include/rtc/hip.hpp index e962d4cd3e..af2f4a9122 100644 --- a/codegen/test/rtc/include/rtc/hip.hpp +++ b/codegen/test/rtc/include/rtc/hip.hpp @@ -1,10 +1,13 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP #include #include -#include #include +#include namespace rtc { diff --git a/codegen/test/rtc/include/rtc/kernel.hpp b/codegen/test/rtc/include/rtc/kernel.hpp index 9f38e90416..b1ee729f77 100644 --- a/codegen/test/rtc/include/rtc/kernel.hpp +++ b/codegen/test/rtc/include/rtc/kernel.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL diff --git a/codegen/test/rtc/include/rtc/manage_ptr.hpp b/codegen/test/rtc/include/rtc/manage_ptr.hpp index 92edf12628..52b94d4b70 100644 --- a/codegen/test/rtc/include/rtc/manage_ptr.hpp +++ b/codegen/test/rtc/include/rtc/manage_ptr.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER diff --git a/codegen/test/rtc/include/rtc/tmp_dir.hpp b/codegen/test/rtc/include/rtc/tmp_dir.hpp index a0a2cb9b77..2f3b26cc43 100644 --- a/codegen/test/rtc/include/rtc/tmp_dir.hpp +++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR diff --git a/codegen/test/rtc/src/compile_kernel.cpp b/codegen/test/rtc/src/compile_kernel.cpp index 8cb71b9043..5a70f898e8 100644 --- a/codegen/test/rtc/src/compile_kernel.cpp +++ b/codegen/test/rtc/src/compile_kernel.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include diff --git a/codegen/test/rtc/src/hip.cpp b/codegen/test/rtc/src/hip.cpp index 747f83e3ba..6f16e36720 100644 --- a/codegen/test/rtc/src/hip.cpp +++ b/codegen/test/rtc/src/hip.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include diff --git a/codegen/test/rtc/src/kernel.cpp b/codegen/test/rtc/src/kernel.cpp index 9fe38e84ad..982e95de17 100644 --- a/codegen/test/rtc/src/kernel.cpp +++ b/codegen/test/rtc/src/kernel.cpp @@ -1,6 +1,10 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include +#include #include // extern declare the function since hip/hip_ext.h header is broken diff --git a/codegen/test/rtc/src/tmp_dir.cpp b/codegen/test/rtc/src/tmp_dir.cpp index 4e89bc3539..b36b17cce1 100644 --- a/codegen/test/rtc/src/tmp_dir.cpp +++ b/codegen/test/rtc/src/tmp_dir.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include #include diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index fc9d074716..fa49f6ddd5 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -5,7 +5,7 @@ #include "ck/config.h" #include "ck/utility/env.hpp" - +#ifndef CK_CODE_GEN_RTC #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" @@ -14,7 +14,7 @@ // environment variable to enable logging: // export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) - +#endif // to do: add various levels of logging with CK_LOG_LEVEL #ifndef CK_TIME_KERNEL diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp index 1c4de5ed31..0a0bcbac38 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -131,7 +131,7 @@ struct ThreadGroupTensorSliceTransfer_v7r2 } template - using is_tuple = decltype(std::declval().IsTuple()); + using is_tuple = decltype(ck::declval().IsTuple()); template __device__ void RunWrite(const DstDescs& dst_descs, diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp index 0eef827a5b..cf20025d46 100644 --- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp +++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp @@ -1,9 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#ifndef CK_CODE_GEN_RTC #include +#endif namespace ck { namespace tensor_operation { @@ -18,6 +20,7 @@ enum struct ConvolutionForwardSpecialization Filter3x3, }; +#ifndef CK_CODE_GEN_RTC inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s) { switch(s) @@ -30,6 +33,7 @@ inline std::string getConvForwardSpecializationString(const ConvolutionForwardSp default: return "Unrecognized specialization!"; } } +#endif } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index 736e241fdf..774982d905 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -1,19 +1,21 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#ifndef CK_CODE_GEN_RTC #include #include #include #include - #include "ck/stream_config.hpp" +#endif namespace ck { namespace tensor_operation { namespace device { +#ifndef CK_CODE_GEN_RTC #define GET_OBJECT_NAME_IMLP \ std::optional GetObjectName() const override \ { \ @@ -41,7 +43,9 @@ namespace device { } #define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL +#endif +#ifndef CK_CODE_GEN_RTC struct BaseArgument { BaseArgument() = default; @@ -66,13 +70,14 @@ struct BaseInvoker virtual ~BaseInvoker() {} }; +#endif struct BaseOperator { BaseOperator() = default; BaseOperator(const BaseOperator&) = default; BaseOperator& operator=(const BaseOperator&) = default; - +#ifndef CK_CODE_GEN_RTC virtual bool IsSupportedArgument(const BaseArgument*) { return false; } virtual std::string GetTypeString() const { return ""; } @@ -100,7 +105,7 @@ struct BaseOperator assert(p_arg); p_arg->p_workspace_ = p_workspace; } - +#endif virtual ~BaseOperator() {} }; diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp index 184efbbd68..8c9b768a8b 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp @@ -1,9 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#ifndef CK_CODE_GEN_RTC #include +#endif #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" @@ -13,8 +15,13 @@ namespace ck { namespace tensor_operation { namespace device { +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif /** * \brief Grouped Convolution Forward @@ -72,12 +79,18 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator static constexpr index_t NumDTensor = DsDataType::Size(); static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor"); - +#ifdef CK_CODE_GEN_RTC + using APointers = ck::conditional_t&, const void*>; + using BPointers = ck::conditional_t&, const void*>; +#else // If DataType is tuple, user has to pass std::array with pointers. using APointers = - std::conditional_t&, const void*>; + ck::conditional_t&, const void*>; using BPointers = - std::conditional_t&, const void*>; + ck::conditional_t&, const void*>; +#endif + +#ifndef CK_CODE_GEN_RTC /** * \brief Make argument pointer for grouped conv fwd. @@ -150,6 +163,7 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator const CDEElementwiseOperation& cde_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; +#endif }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp index 0bb45b18c3..997dcb75a6 100644 --- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp +++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -29,6 +29,7 @@ enum struct GemmSpecialization MNKOPadding, }; +#ifndef CK_CODE_GEN_RTC inline std::string getGemmSpecializationString(const GemmSpecialization& s) { switch(s) @@ -52,6 +53,7 @@ inline std::string getGemmSpecializationString(const GemmSpecialization& s) default: return "Unrecognized specialization!"; } } +#endif } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 180e32c8b6..d9c4e22049 100644 --- a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -3,11 +3,17 @@ #pragma once +#ifndef CK_CODE_GEN_RTC #include #include #include #include #include +#include + +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#endif #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" @@ -15,15 +21,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/io.hpp" namespace ck { namespace tensor_operation { @@ -259,8 +262,13 @@ __global__ void } // namespace +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif // // @brief Device Convolution operation. @@ -429,8 +437,8 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // If we are using multiAB and one of the template datatype parameters is not a tuple, convert // it to it - using GemmADataType = std::conditional_t, ADataType>; - using GemmBDataType = std::conditional_t, BDataType>; + using GemmADataType = ck::conditional_t, ADataType>; + using GemmBDataType = ck::conditional_t, BDataType>; #define GridwiseGemmTemplateParameters \ GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ @@ -449,15 +457,13 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle CDEBlockTransferScalarPerVector_NPerBlock, LoopSched // Use appropriate gridwise gemm using GridwiseGemm = - std::conditional_t, - GridwiseGemmMultipleD_xdl_cshuffle>; + ck::conditional_t, + GridwiseGemmMultipleD_xdl_cshuffle>; // If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers. - using APointers = - std::conditional_t&, const void*>; - using BPointers = - std::conditional_t&, const void*>; + using APointers = ck::conditional_t&, const void*>; + using BPointers = ck::conditional_t&, const void*>; // Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not // in initializer list what is required for single const pointer). using AGridPointer = remove_cvref_t< @@ -812,7 +818,6 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle static_for<0, NumDTensor, 1>{}([&](auto i) { using DLayout = remove_cvref_t>; - // FIXME: layout if constexpr(is_same_v || is_same_v || is_same_v || is_same_v || @@ -965,18 +970,18 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle const BElementwiseOperation& b_element_op, const CDEElementwiseOperation& cde_element_op) { - std::array a_g_n_c_wis_lengths_i32; - std::array a_g_n_c_wis_strides_i32; - std::array b_g_k_c_xs_lengths_i32; - std::array b_g_k_c_xs_strides_i32; - std::array, NumDTensor> ds_g_n_k_wos_lengths_i32; - std::array, NumDTensor> ds_g_n_k_wos_strides_i32; - std::array e_g_n_k_wos_lengths_i32; - std::array e_g_n_k_wos_strides_i32; - std::array conv_filter_strides_i32; - std::array conv_filter_dilations_i32; - std::array input_left_pads_i32; - std::array input_right_pads_i32; + ck::Array a_g_n_c_wis_lengths_i32; + ck::Array a_g_n_c_wis_strides_i32; + ck::Array b_g_k_c_xs_lengths_i32; + ck::Array b_g_k_c_xs_strides_i32; + ck::Array, NumDTensor> ds_g_n_k_wos_lengths_i32; + ck::Array, NumDTensor> ds_g_n_k_wos_strides_i32; + ck::Array e_g_n_k_wos_lengths_i32; + ck::Array e_g_n_k_wos_strides_i32; + ck::Array conv_filter_strides_i32; + ck::Array conv_filter_dilations_i32; + ck::Array input_left_pads_i32; + ck::Array input_right_pads_i32; array_convert(a_g_n_c_wis_lengths_i32, a_g_n_c_wis_lengths); array_convert(a_g_n_c_wis_strides_i32, a_g_n_c_wis_strides); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp index e4203e0313..9482812f75 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/library/utility/numeric.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp index 7f28ec7680..2666051c86 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp @@ -205,8 +205,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK #include +#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -212,9 +213,13 @@ __global__ void } } // namespace - +#ifdef CK_CODE_GEN_RTC +template +using is_tuple = decltype(ck::declval().IsTuple()); +#else template using is_tuple = decltype(std::declval().IsTuple()); +#endif // // @brief Device Convolution operation. diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index 589a0daa99..85d1ba8f48 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -9,6 +9,7 @@ #include #include +#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp index 648736fcbf..1ad37058db 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp @@ -3,6 +3,7 @@ #pragma once +#include "ck/library/utility/numeric.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp index 2202bc5695..85adb64b43 100644 --- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout } // namespace convolution +#ifndef CK_CODE_GEN_RTC template < typename Layout, typename std::enable_if::value, bool>::type = false> @@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&) os << Layout::name; return os; } +#endif } // namespace tensor_layout } // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index c87c90a91d..530876650e 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -340,8 +340,8 @@ struct Bilinear }; template <> - __host__ __device__ constexpr void operator()( - std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x0, const int8_t& x1) const { y = type_convert(alpha_ * type_convert(x0) + beta_ * type_convert(x1)); diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index b914c0b96f..370d03258d 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -533,7 +533,7 @@ struct NormalizeInInfer const T3& gamma, const T4& beta) const { - static_assert(std::is_same::value || std::is_same::value, + static_assert(is_same::value || is_same::value, "Data type is not supported by this operation!"); using ck::type_convert; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 5e522fb2ea..139f0057e4 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -252,7 +252,7 @@ struct PassThroughPack2 template __host__ __device__ void operator()(Y& y, const X& x) const; - __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const + __host__ __device__ constexpr void operator()(half2_t& y, const f8x2_t& x) const { auto t = type_convert(x); y = type_convert(t); @@ -479,7 +479,7 @@ struct PassThrough template <> __host__ __device__ void operator()(bf8_t& y, const half_t& x) const { - y = ck::type_convert(x); + y = type_convert(x); } }; @@ -552,21 +552,21 @@ struct Scale template __host__ __device__ void operator()(Y& y, const X& x) const { - y = ck::type_convert(ck::type_convert(x) * scale_); + y = type_convert(type_convert(x) * scale_); } template <> __host__ __device__ void operator()(half_t& y, const half_t& x) const { - y = ck::type_convert(scale_) * x; + y = type_convert(scale_) * x; }; template <> __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { - const float x_tmp = ck::type_convert(x); + const float x_tmp = type_convert(x); const float y_tmp = scale_ * x_tmp; - y = ck::type_convert(y_tmp); + y = type_convert(y_tmp); }; template <> @@ -584,7 +584,7 @@ struct Scale template <> __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { - y = ck::type_convert(scale_ * ck::type_convert(x)); + y = type_convert(scale_ * type_convert(x)); }; float scale_; @@ -600,7 +600,7 @@ struct ScaleAndResetNaNToMinusInfinity template <> __host__ __device__ void operator()(float& y, const float& x) const { - y = ck::math::isnan(x) ? -ck::NumericLimits::Infinity() : scale_ * x; + y = math::isnan(x) ? -NumericLimits::Infinity() : scale_ * x; }; float scale_; @@ -671,12 +671,13 @@ struct UnaryAbs template __host__ __device__ void operator()(T& y, const T& x) const { + static_assert(is_same::value || is_same::value || is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::abs(x); + y = math::abs(x); }; template <> @@ -694,7 +695,7 @@ struct UnarySqrt static_assert(is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sqrt(x); + y = math::sqrt(x); }; }; @@ -713,9 +714,9 @@ struct Relu template <> __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { - float x_f32 = ck::type_convert(x); + float x_f32 = type_convert(x); float y_f32 = x_f32 > 0 ? x_f32 : 0; - y = ck::type_convert(y_f32); + y = type_convert(y_f32); } }; @@ -731,7 +732,7 @@ struct FastGelu template __device__ void operator()(Y& y, const X& x) const; - +#ifndef CK_CODE_GEN_RTC template <> __host__ void operator()(float& y, const float& x) const { @@ -742,6 +743,7 @@ struct FastGelu const float emu = exp(u); y = x / (1.f + emu); } +#endif // device code, use lower precision "__ocml_exp_f32" and "rcp" template <> @@ -753,7 +755,7 @@ struct FastGelu const float u = x * (c1 * x * x + c2); const float emu = __ocml_exp_f32(u); - y = x * ck::math::rcp(1.f + emu); + y = x * math::rcp(1.f + emu); } template <> @@ -851,10 +853,9 @@ struct Gelu } template <> - __host__ __device__ void operator()(ck::half_t& y, - const ck::half_t& x) const + __host__ __device__ void operator()(half_t& y, const half_t& x) const { - y = ck::half_t(0.5) * x * (ck::half_t(1) + ck::half_t(erf(float(0.70710678118f * x)))); + y = half_t(0.5) * x * (half_t(1) + half_t(erf(float(0.70710678118f * x)))); } }; @@ -868,7 +869,7 @@ struct Sigmoid is_same::value, "Data type is not supported by this operation!"); constexpr T one = type_convert(1); - y = one / (one + ck::math::exp(-x)); + y = one / (one + math::exp(-x)); }; }; @@ -877,11 +878,11 @@ struct Silu template __host__ __device__ void operator()(T& y, const T& x) const { - static_assert(is_same_v || is_same_v || is_same_v || + static_assert(is_same_v || is_same_v || is_same_v || is_same_v || is_same_v, "Data type is not supported by this operation!"); constexpr T one = type_convert(1); - y = x * (one / (one + ck::math::exp(-x))); + y = x * (one / (one + math::exp(-x))); }; }; @@ -895,7 +896,7 @@ struct TanH is_same::value, "Data type is not supported by this operation!"); - y = ck::math::tanh(x); + y = math::tanh(x); }; }; @@ -905,11 +906,11 @@ struct ACos __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::acos(x); + y = math::acos(x); }; }; @@ -919,11 +920,11 @@ struct Neg __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::neg(x); + y = math::neg(x); }; }; @@ -933,11 +934,11 @@ struct ATan __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::atan(x); + y = math::atan(x); }; }; @@ -947,11 +948,11 @@ struct Sin __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sin(x); + y = math::sin(x); }; }; @@ -961,11 +962,11 @@ struct ASinH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::asinh(x); + y = math::asinh(x); }; }; @@ -975,11 +976,11 @@ struct Cos __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::cos(x); + y = cos(x); }; }; @@ -989,11 +990,11 @@ struct ACosH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::acosh(x); + y = math::acosh(x); }; }; @@ -1003,11 +1004,11 @@ struct Tan __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::tan(x); + y = math::tan(x); }; }; @@ -1017,11 +1018,11 @@ struct ATanH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::atanh(x); + y = math::atanh(x); }; }; @@ -1031,11 +1032,11 @@ struct SinH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::sinh(x); + y = math::sinh(x); }; }; @@ -1045,11 +1046,11 @@ struct Ceil __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::ceil(x); + y = math::ceil(x); }; }; @@ -1059,11 +1060,11 @@ struct Exp __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::exp(x); + y = math::exp(x); }; }; @@ -1073,11 +1074,11 @@ struct CosH __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::cosh(x); + y = math::cosh(x); }; }; @@ -1087,11 +1088,11 @@ struct Floor __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::floor(x); + y = math::floor(x); }; }; @@ -1101,11 +1102,11 @@ struct Log __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::log(x); + y = math::log(x); }; }; @@ -1115,11 +1116,11 @@ struct ASin __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::asin(x); + y = math::asin(x); }; }; @@ -1129,11 +1130,11 @@ struct Rcp __host__ __device__ void operator()(T& y, const T& x) const { static_assert(is_same::value || is_same::value || - is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, "Data type is not supported by this operation!"); - y = ck::math::rcp(x); + y = math::rcp(x); }; }; @@ -1153,7 +1154,7 @@ struct Swish "Data type is not supported by this operation!"); float bx = -beta_ * type_convert(x); - y = type_convert(x / (1.f + ck::math::exp(bx))); + y = type_convert(x / (1.f + math::exp(bx))); }; const float beta_; @@ -1172,7 +1173,7 @@ struct SoftRelu "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); constexpr T one = type_convert(1); - y = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; + y = math::log(one + math::exp(x * casted_alpha)) / casted_alpha; } const float alpha_; }; @@ -1193,7 +1194,7 @@ struct Power T casted_beta = type_convert(beta_); T casted_gamma = type_convert(gamma_); T shifted_scaled_x = casted_alpha + casted_beta * x; - y = ck::math::pow(shifted_scaled_x, casted_gamma); + y = math::pow(shifted_scaled_x, casted_gamma); } const float alpha_; const float beta_; @@ -1213,7 +1214,7 @@ struct ClippedRelu "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); T casted_beta = type_convert(beta_); - y = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); + y = math::min(casted_beta, math::max(casted_alpha, x)); } const float alpha_; const float beta_; @@ -1248,7 +1249,7 @@ struct Elu is_same::value, "Data type is not supported by this operation!"); T casted_alpha = type_convert(alpha_); - y = x > 0 ? x : casted_alpha * ck::math::expm1(x); + y = x > 0 ? x : casted_alpha * math::expm1(x); } const float alpha_; }; @@ -1350,10 +1351,10 @@ struct FastNumericArrayConverter }; template <> -struct FastNumericArrayConverter +struct FastNumericArrayConverter { using InputArray = vector_type; - using OutputArray = vector_type; + using OutputArray = vector_type; __device__ static OutputArray convert(InputArray const& Input) { @@ -1383,13 +1384,13 @@ struct FastNumericArrayConverter }; template -struct FastNumericArrayConverter +struct FastNumericArrayConverter { static constexpr int VEC_WIDTH = 4; static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); using InputArray = vector_type; - using OutputArray = vector_type; + using OutputArray = vector_type; __device__ static OutputArray convert(InputArray const& Input) { @@ -1398,7 +1399,7 @@ struct FastNumericArrayConverter OutputArray Output; using Vec_InputArray = vector_type; - using Vec_OutputArray = vector_type; + using Vec_OutputArray = vector_type; Vec_OutputArray* half_4_ptr = reinterpret_cast(&Output); Vec_InputArray const* uint8_4_ptr = reinterpret_cast(&Input); diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp index 56c37b1b72..2bc9ef87ac 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -1,14 +1,17 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "ck/utility/math.hpp" #include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" #include "ck/tensor_description/tensor_adaptor.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" +#ifndef CK_CODE_GEN_RTC #include #include +#endif namespace ck { @@ -978,8 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit // Create 3D grid const auto M0 = math::integer_divide_ceil(M, MPerBlock); const auto N0 = math::integer_divide_ceil(N, NPerBlock); - - return std::make_tuple(N0, M0, k_split); + return make_tuple(N0, M0, k_split); } template @@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK uint32_t dp_for_sk_iters = k_iters_per_tile.get(); uint32_t best_sk_score = - std::numeric_limits::max(); // we need to find the smallest sk iters + NumericLimits::Max(); // we need to find the smallest sk iters for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles; tentative_sk_blocks++) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp index 150dd98064..344656b13f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -423,10 +423,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeAsGridDescriptor_M_K(const std::array& MRaws, - const std::array& KRaws, - const std::array& AsStride) + __host__ __device__ static auto MakeAsGridDescriptor_M_K( +#ifdef CK_CODE_GEN_RTC + const ck::Array& MRaws, + const ck::Array& KRaws, + const ck::Array& AsStride +#else + const std::array& MRaws, + const std::array& KRaws, + const std::array& AsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -462,10 +469,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeBsGridDescriptor_N_K(const std::array& NRaws, - const std::array& KRaws, - const std::array& BsStride) + __host__ __device__ static auto MakeBsGridDescriptor_N_K( +#ifdef CK_CODE_GEN_RTC + const ck::Array& NRaws, + const ck::Array& KRaws, + const ck::Array& BsStride +#else + const std::array& NRaws, + const std::array& KRaws, + const std::array& BsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -500,10 +514,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle } template - __host__ __device__ static auto - MakeDsGridDescriptor_M_N(const std::array& MRaws, - const std::array& NRaws, - const std::array& DsStride) + __host__ __device__ static auto MakeDsGridDescriptor_M_N( +#ifdef CK_CODE_GEN_RTC + const ck::Array& MRaws, + const ck::Array& NRaws, + const ck::Array& DsStride +#else + const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride +#endif + ) { return generate_tuple( [&](auto i) { @@ -969,9 +990,15 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle const index_t M, const index_t N, const index_t K, +#ifdef CK_CODE_GEN_RTC + const ck::Array StrideAs, + const ck::Array StrideBs, + const ck::Array StrideDs, +#else const std::array StrideAs, const std::array StrideBs, const std::array StrideDs, +#endif const index_t StrideE, const Block2ETileMap& block_2_etile_map) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp index 4b344c02f8..eb1eb533d7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -473,11 +473,19 @@ struct GridwiseGemmMultipleD_xdl_cshuffle return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); } +#ifdef CK_CODE_GEN_RTC + template + __host__ __device__ static auto + MakeDsGridDescriptor_M_N(const ck::Array& MRaws, + const ck::Array& NRaws, + const ck::Array& DsStride) +#else template __host__ __device__ static auto MakeDsGridDescriptor_M_N(const std::array& MRaws, const std::array& NRaws, const std::array& DsStride) +#endif { return generate_tuple( [&](auto i) { @@ -941,7 +949,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle const index_t K, const index_t StrideA, const index_t StrideB, +#ifdef CK_CODE_GEN_RTC + const ck::Array StrideDs, +#else const std::array StrideDs, +#endif const index_t StrideE, const Block2ETileMap& block_2_etile_map) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 44cbbcd049..9dad66913a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -1,10 +1,11 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once - +#ifndef CK_CODE_GEN_RTC #include #include +#endif #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" @@ -53,12 +54,15 @@ constexpr auto GridwiseGemmPipeline_Selector() } else { +#ifndef CK_CODE_GEN_RTC std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl; +#endif } } } // namespace ck +#ifndef CK_CODE_GEN_RTC inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) { switch(p) @@ -71,3 +75,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) } return os; } +#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index bb1871ae62..21315c2567 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -212,7 +212,7 @@ template ::type = false> struct ThreadwiseTensorSliceTransfer_v2 { - static_assert((InvalidElementAsNaN && !std::is_integral::value) || + static_assert((InvalidElementAsNaN && !ck::is_integral::value) || (!InvalidElementAsNaN), "Filling invalid element as NaN is only for floating point types"); diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index b91b12ad52..3db94deccb 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -1,10 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -148,8 +147,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -201,11 +200,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I0]}, ZYX_{X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -219,8 +222,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -272,11 +275,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I1]}, ZYX_{Y_ * X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -290,8 +297,8 @@ struct TransformConvFwdToGemm template ::type = false> + index_t NDim = NDimSpatial, + typename ck::enable_if::type = false> __host__ __device__ TransformConvFwdToGemm(const ConvDimsType& a_g_n_c_wis_lengths, const ConvDimsType& a_g_n_c_wis_strides, const ConvDimsType& b_g_k_c_xs_lengths, @@ -343,11 +350,15 @@ struct TransformConvFwdToGemm InRightPadW_{input_right_pads[I2]}, ZYX_{Z_ * Y_ * X_} { +#ifdef CK_CODE_GEN_RTC + static_assert(is_same_v>); + static_assert(is_same_v>); +#else static_assert(is_same_v> || is_same_v>); static_assert(is_same_v> || is_same_v>); - +#endif if constexpr(SplitN) { N_ = GetSplitedNSize( @@ -478,11 +489,11 @@ struct TransformConvFwdToGemm // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as // properties template || - is_same_v || - is_same_v), - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeADescriptor_M_K() const { if constexpr(ConvForwardSpecialization == @@ -691,11 +702,11 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v), - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeADescriptor_M_K() const { @@ -932,7 +943,7 @@ struct TransformConvFwdToGemm } template || is_same_v || is_same_v), @@ -1242,19 +1253,19 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v, - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v, + bool>::type = false> __host__ __device__ auto MakeBDescriptor_N_K() const { if constexpr(ConvForwardSpecialization == device::ConvolutionForwardSpecialization::Filter3x3) { using FilterSizeNumType = - std::conditional_t, - std::conditional_t, Number<27>>>; + ck::conditional_t, + ck::conditional_t, Number<27>>>; if constexpr(NumGroupsToMerge == 1) { @@ -1297,13 +1308,13 @@ struct TransformConvFwdToGemm template < typename BLayout, - typename std::enable_if || - is_same_v || - is_same_v || - is_same_v || - is_same_v || - is_same_v, - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v, + bool>::type = false> __host__ __device__ auto MakeBDescriptor_N_K() const { const auto wei_k_yx_c_desc = make_naive_tensor_descriptor( @@ -1318,36 +1329,36 @@ struct TransformConvFwdToGemm return wei_gemmn_gemmk_desc; } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_), make_tuple(I0, KStrideTensorC_)); } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_), make_tuple(I0, KStrideTensorC_)); } - template ), - bool>::type = false> + typename ck::enable_if), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_), @@ -1355,12 +1366,12 @@ struct TransformConvFwdToGemm } template || - is_same_v || - is_same_v), - bool>::type = false> + index_t NDimSp = NDimSpatial, + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { const IndexType NDoHoWo = N_ * Wo_; @@ -1410,11 +1421,11 @@ struct TransformConvFwdToGemm template || - is_same_v || - is_same_v), - bool>::type = false> + typename ck::enable_if || + is_same_v || + is_same_v), + bool>::type = false> __host__ __device__ auto MakeCDescriptor_M_N() const { const IndexType NDoHoWo = N_ * Ho_ * Wo_; @@ -1467,7 +1478,7 @@ struct TransformConvFwdToGemm template || is_same_v || is_same_v), diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index ad13c44311..534a01e083 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "data_type.hpp" @@ -1021,15 +1021,24 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; static_assert(bytes_per_thread == dword_bytes); +#ifndef CK_CODE_GEN_RTC const uint32_t* global_ptr = reinterpret_cast(reinterpret_cast(global_base_ptr)); +#else + const uint32_t* global_ptr = + reinterpret_cast(reinterpret_cast(global_base_ptr)); +#endif const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size); const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM T* lds_ptr = lds_base_ptr + lds_offset; +#ifndef CK_CODE_GEN_RTC auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); +#else + auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); +#endif asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), @@ -1038,8 +1047,13 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, #else // LDS pointer must be attributed with the LDS address space. __attribute__((address_space(3))) uint32_t* lds_ptr = +#ifndef CK_CODE_GEN_RTC reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( reinterpret_cast(lds_base_ptr + lds_offset)); +#else + reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( + reinterpret_cast(lds_base_ptr + lds_offset)); +#endif llvm_amdgcn_raw_buffer_load_lds( src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index e9174904c9..b4838277f1 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -1,8 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/ck.hpp" +#include "ck/utility/enable_if.hpp" #include "ck/utility/random_gen.hpp" #include "ck/utility/type.hpp" @@ -424,9 +426,9 @@ __host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a) } template || std::is_same_v || - std::is_same_v || std::is_same_v, - bool> = true> + ck::enable_if_t || is_same_v || + is_same_v || is_same_v, + bool> = true> __host__ __device__ static inline constexpr bool fp8_is_inf(T) { return false; @@ -823,7 +825,11 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f) if constexpr(stochastic_rounding) { constexpr int seed = 1254739; - rng = prand_generator(reinterpret_cast(&f), f); +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&f), f); +#else + rng = prand_generator(reinterpret_cast(&f), f); +#endif } return cast_to_f8_from_f32( f, rng); @@ -839,7 +845,11 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) if constexpr(stochastic_rounding) { constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC rng = prand_generator(reinterpret_cast(&f), f); +#else + rng = prand_generator(reinterpret_cast(&f), f); +#endif } if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ) diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp index d6e1eab314..128c8e9a2c 100644 --- a/include/ck/utility/amd_wave_read_first_lane.hpp +++ b/include/ck/utility/amd_wave_read_first_lane.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,10 +7,12 @@ #include "ck/utility/functional2.hpp" #include "ck/utility/math.hpp" +#ifndef CK_CODE_GEN_RTC #include #include #include #include +#endif namespace ck { namespace detail { @@ -37,7 +39,7 @@ struct get_carrier<3> { using value_type = uint32_t; - std::array bytes; + Array bytes; static_assert(sizeof(bytes) <= sizeof(value_type)); // replacement of host std::copy_n() @@ -61,22 +63,22 @@ struct get_carrier<3> // method to trigger template substitution failure __device__ carrier(const carrier& other) noexcept { - copy_n(other.bytes.begin(), bytes.size(), bytes.begin()); + copy_n(other.bytes.begin(), bytes.Size(), bytes.begin()); } public: __device__ carrier& operator=(value_type value) noexcept { - copy_n(reinterpret_cast(&value), bytes.size(), bytes.begin()); + copy_n(reinterpret_cast(&value), bytes.Size(), bytes.begin()); return *this; } __device__ operator value_type() const noexcept { - std::byte result[sizeof(value_type)]; + ck::byte result[sizeof(value_type)]; - copy_n(bytes.begin(), bytes.size(), result); + copy_n(bytes.begin(), bytes.Size(), result); return *reinterpret_cast(result); } @@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) { constexpr unsigned object_size = sizeof(int64_t); constexpr unsigned second_part_offset = object_size / 2; - auto* const from_obj = reinterpret_cast(&value); - alignas(int64_t) std::byte to_obj[object_size]; + auto* const from_obj = reinterpret_cast(&value); + alignas(int64_t) ck::byte to_obj[object_size]; using Sgpr = uint32_t; @@ -122,17 +124,16 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value) return *reinterpret_cast(to_obj); } -template < - typename Object, - typename = std::enable_if_t && std::is_trivially_copyable_v>> +template && ck::is_trivially_copyable_v>> __device__ auto amd_wave_read_first_lane(const Object& obj) { using Size = unsigned; constexpr Size SgprSize = 4; constexpr Size ObjectSize = sizeof(Object); - auto* const from_obj = reinterpret_cast(&obj); - alignas(Object) std::byte to_obj[ObjectSize]; + auto* const from_obj = reinterpret_cast(&obj); + alignas(Object) ck::byte to_obj[ObjectSize]; constexpr Size RemainedSize = ObjectSize % SgprSize; constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp index 5366c56a9d..2afad00d49 100644 --- a/include/ck/utility/array.hpp +++ b/include/ck/utility/array.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef CK_ARRAY_HPP #define CK_ARRAY_HPP @@ -38,6 +38,8 @@ struct Array } __host__ __device__ constexpr const TData* begin() const { return &mData[0]; } __host__ __device__ constexpr const TData* end() const { return &mData[NSize]; } + __host__ __device__ constexpr TData* begin() { return &mData[0]; } + __host__ __device__ constexpr TData* end() { return &mData[NSize]; } }; // empty Array @@ -54,7 +56,7 @@ template __host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs) { using data_type = remove_cvref_t; - return Array{std::forward(x), std::forward(xs)...}; + return Array{ck::forward(x), ck::forward(xs)...}; } // make empty array diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp index 9c7b954565..bd0ca42ecd 100644 --- a/include/ck/utility/container_helper.hpp +++ b/include/ck/utility/container_helper.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef CK_CONTAINER_HELPER_HPP #define CK_CONTAINER_HELPER_HPP @@ -326,14 +326,14 @@ template __host__ __device__ constexpr auto container_concat(const Array& ax, const Array& ay) { return unpack2( - [&](auto&&... zs) { return make_array(std::forward(zs)...); }, ax, ay); + [&](auto&&... zs) { return make_array(ck::forward(zs)...); }, ax, ay); } template __host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) { return unpack2( - [&](auto&&... zs) { return make_tuple(std::forward(zs)...); }, tx, ty); + [&](auto&&... zs) { return make_tuple(ck::forward(zs)...); }, tx, ty); } template diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index d9c954c50f..882d661331 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -5,9 +5,21 @@ #include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/statically_indexed_array.hpp" - +#ifdef CK_CODE_GEN_RTC +using int8_t = signed char; +using uint8_t = unsigned char; +using int16_t = signed short; +using uint16_t = unsigned short; +using float_t = float; +#endif namespace ck { +#ifdef CK_CODE_GEN_RTC +using byte = unsigned char; +#else +using std::byte; +#endif + using bhalf_t = ushort; using half_t = _Float16; using int4_t = _BitInt(4); @@ -217,7 +229,7 @@ struct scalar_type }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using type = d1_t; @@ -253,7 +265,7 @@ struct vector_type()>> __device__ int static err = 0; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -313,7 +325,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -383,7 +395,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -453,7 +465,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d4_t __attribute__((ext_vector_type(4))); @@ -523,7 +535,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -605,7 +617,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -687,7 +699,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d4_t __attribute__((ext_vector_type(4))); @@ -769,7 +781,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -863,7 +875,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -967,7 +979,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1083,7 +1095,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1209,7 +1221,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; typedef T d2_t __attribute__((ext_vector_type(2))); @@ -1374,7 +1386,7 @@ template struct non_native_vector_base< T, N, - std::enable_if_t> + ck::enable_if_t> { using data_t = typename nnvb_data_t_selector::type; // select data_t based on the size of T static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch"); @@ -1499,7 +1511,7 @@ struct scalar_type> // non-native vector_type implementation template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1550,7 +1562,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1613,7 +1625,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1686,7 +1698,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1771,7 +1783,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d1_nnv_t = non_native_vector_base; @@ -1866,7 +1878,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d2_t = non_native_vector_base; @@ -1970,7 +1982,7 @@ struct vector_type()>> }; template -struct vector_type()>> +struct vector_type()>> { using d1_t = T; using d2_t = non_native_vector_base; @@ -2210,20 +2222,230 @@ using pk_i4x2_t = typename vector_type::type; using pk_i4x4_t = typename vector_type::type; using pk_i4x8_t = typename vector_type::type; +#ifdef CK_CODE_GEN_RTC +template +struct NumericLimits; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; } + + __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; } + + __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; } + + __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr int32_t QuietNaN() { return 0; } +}; +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; } + + __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; } + + __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; } + + __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr int16_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; } + + __host__ __device__ static constexpr int8_t Min() noexcept { return -128; } + + __host__ __device__ static constexpr int8_t Max() noexcept { return 127; } + + __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr int8_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; } + + __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; } + + __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; } + + __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; } +}; + +template <> +struct NumericLimits +{ + static constexpr unsigned int binary_min = 0x00800000; + static constexpr unsigned int binary_max = 0x7F7FFFFF; + static constexpr unsigned int binary_lowest = 0xFF7FFFFF; + static constexpr unsigned int binary_qnan = 0xFFC00001; + static constexpr unsigned int binary_inf = 0x7F8000000; + + __host__ __device__ static constexpr float Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr float Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr float Lowest() { return bit_cast(binary_lowest); } + + __host__ __device__ static constexpr float QuietNaN() { return bit_cast(binary_qnan); } + + __host__ __device__ static constexpr float Infinity() { return bit_cast(binary_inf); } +}; + +template <> +struct NumericLimits +{ + static constexpr unsigned short binary_min = 0x0400; + static constexpr unsigned short binary_max = 0x7BFF; + static constexpr unsigned short binary_lowest = 0xFBFF; + static constexpr unsigned short binary_qnan = 0x7FFF; + + __host__ __device__ static constexpr half_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr half_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr half_t Lowest() { return bit_cast(binary_lowest); } + + __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast(binary_qnan); } +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); } + + __host__ __device__ static constexpr int4_t Max() { return int4_t(7); } + + __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); } +}; +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + +template <> +struct NumericLimits +{ + // negative zero nan mode with exp bias = 8 + static constexpr uint8_t binary_min = 0x08; // 0b00001000 + static constexpr uint8_t binary_max = 0x7F; // 0b01111111 + static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111 + static constexpr uint8_t binary_qnan = 0x80; // 0b10000000 + // ieee mode with exp bias = 7 + // static constexpr uint8_t binary_min = 0x08; // 0b00001000 + // static constexpr uint8_t binary_max = 0x77; // 0b01110111 + // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111 + // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!=0 + + __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); } + + __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); } + + __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); } + + __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + // negative zero nan mode with exp bias = 16 + static constexpr uint8_t binary_min = 0x04; // 0b00000100 + static constexpr uint8_t binary_max = 0x7F; // 0b01111111 + static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111 + static constexpr uint8_t binary_qnan = 0x80; // 0b10000000 + // ieee mode with exp bias = 15 + // static constexpr uint8_t binary_min = 0x04; // 0b00000100 + // static constexpr uint8_t binary_max = 0x7B; // 0b01111011 + // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 + // static constexpr uint8_t binary_qnan = 0x79; // any sign, exp=1111, mant!= + + __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); } + + __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); } + + __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); } + + __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x08; // 0b00001000 = 2^-6 + static constexpr uint8_t binary_max = 0x7E; // 0b01111110 = 448 + static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448 + static constexpr uint8_t binary_qnan = 0x7F; // 0b01111111 + + __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr f8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr f8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; + +template <> +struct NumericLimits +{ + static constexpr uint8_t binary_min = 0x04; // 0b00000100 = 2^-14 + static constexpr uint8_t binary_max = 0x7B; // 0b01111011 = 57344 + static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344 + static constexpr uint8_t binary_qnan = 0x7D; // 0b01111101 + + __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr bf8_ocp_t Lowest() + { + return bit_cast(binary_lowest); + } + + __host__ __device__ static constexpr bf8_ocp_t QuietNaN() + { + return bit_cast(binary_qnan); + } +}; +#else template struct NumericLimits { __host__ __device__ static constexpr T Min() { return std::numeric_limits::min(); } - __host__ __device__ static constexpr T Max() { return std::numeric_limits::max(); } - __host__ __device__ static constexpr T Lowest() { return std::numeric_limits::lowest(); } - __host__ __device__ static constexpr T QuietNaN() { return std::numeric_limits::quiet_NaN(); } - __host__ __device__ static constexpr T Infinity() { return std::numeric_limits::infinity(); } }; @@ -2347,6 +2569,7 @@ struct NumericLimits return bit_cast(binary_qnan); } }; +#endif template struct NumericUtils diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp index 03c4e16dd6..2b247cc02a 100644 --- a/include/ck/utility/debug.hpp +++ b/include/ck/utility/debug.hpp @@ -1,8 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef UTILITY_DEBUG_HPP #define UTILITY_DEBUG_HPP +#include "type.hpp" namespace ck { namespace debug { diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp index c0a3c99f1f..6ba63fc761 100644 --- a/include/ck/utility/enable_if.hpp +++ b/include/ck/utility/enable_if.hpp @@ -1,14 +1,31 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once namespace ck { +#ifndef CK_CODE_GEN_RTC template using enable_if = std::enable_if; template using enable_if_t = typename std::enable_if::type; +#else +template +struct enable_if +{ +}; + +template +struct enable_if +{ + using type = T; +}; + +template +using enable_if_t = typename enable_if::type; +#endif + } // namespace ck diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp index 6455402dcb..809f302f74 100644 --- a/include/ck/utility/env.hpp +++ b/include/ck/utility/env.hpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +#ifndef CK_CODE_GEN_RTC #pragma once #include @@ -183,3 +184,4 @@ void UpdateEnvVar(EnvVar, const std::string_view& val) } } // namespace ck +#endif diff --git a/include/ck/utility/functional.hpp b/include/ck/utility/functional.hpp index 91797d2409..cd48ed1747 100644 --- a/include/ck/utility/functional.hpp +++ b/include/ck/utility/functional.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y) { if constexpr(predicate) { - return std::forward(x); + return ck::forward(x); } else { - return std::forward(y); + return ck::forward(y); } } diff --git a/include/ck/utility/functional4.hpp b/include/ck/utility/functional4.hpp index b5f3df8d7c..8e86a296dc 100644 --- a/include/ck/utility/functional4.hpp +++ b/include/ck/utility/functional4.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef CK_FUNCTIONAL4_HPP #define CK_FUNCTIONAL4_HPP @@ -21,7 +21,7 @@ struct unpack_impl> template __host__ __device__ constexpr auto operator()(F&& f, X&& x) const { - return std::forward(f)(std::forward(x).At(Number{})...); + return ck::forward(f)(ck::forward(x).At(Number{})...); } }; @@ -35,8 +35,8 @@ struct unpack2_impl, Sequence> template __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const { - return std::forward(f)(std::forward(x).At(Number{})..., - std::forward(y).At(Number{})...); + return ck::forward(f)(ck::forward(x).At(Number{})..., + ck::forward(y).At(Number{})...); } }; @@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x) { using X_ = remove_reference_t; return detail::unpack_impl::type>{}( - std::forward(f), std::forward(x)); + ck::forward(f), ck::forward(x)); } // TODO: properly implement unpack that takes any number of containers @@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y) using Y_ = remove_reference_t; return detail::unpack2_impl::type, typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}( - std::forward(f), std::forward(x), std::forward(y)); + ck::forward(f), ck::forward(x), ck::forward(y)); } } // namespace ck diff --git a/include/ck/utility/integral_constant.hpp b/include/ck/utility/integral_constant.hpp index 376070eb3d..75f35d762c 100644 --- a/include/ck/utility/integral_constant.hpp +++ b/include/ck/utility/integral_constant.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -48,4 +48,9 @@ __host__ __device__ constexpr auto operator%(integral_constant, integral_ return integral_constant{}; } +template +using bool_constant = integral_constant; + +using true_type = bool_constant; +using false_type = bool_constant; } // namespace ck diff --git a/include/ck/utility/is_detected.hpp b/include/ck/utility/is_detected.hpp index 7a324a6c45..a700fcfff1 100644 --- a/include/ck/utility/is_detected.hpp +++ b/include/ck/utility/is_detected.hpp @@ -1,22 +1,24 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once +#include "ck/utility/integral_constant.hpp" + namespace ck { namespace detail { template class Op, class... Args> struct detector { - using value_t = std::false_type; + using value_t = integral_constant; using type = Default; }; template class Op, class... Args> -struct detector>, Op, Args...> +struct detector>, Op, Args...> { - using value_t = std::true_type; + using value_t = integral_constant; using type = Op; }; } // namespace detail @@ -32,12 +34,12 @@ template