Introduce gemm_softmax_gemm to codegen.

2026-06-06 15:54:31 +00:00 · 2024-09-25 08:22:07 +00:00
parent 3528a523ff
commit d43cd4ad32
52 changed files with 2108 additions and 187 deletions
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,16 +4,12 @@
 #pragma once

 #include "ck/config.h"
-#include "ck/utility/env.hpp"
-
+#ifndef __HIPCC_RTC__
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
-
-// environment variable to enable logging:
-// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
-CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+#endif

 // to do: add various levels of logging with CK_LOG_LEVEL

--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <string>
 #include <map>
 #include <hip/hip_runtime.h>
@@ -96,3 +97,4 @@ inline bool is_gfx12_supported()
 }

 } // namespace ck
+#endif
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <hip/hip_runtime.h>

 #include "ck/ck.hpp"
@@ -160,3 +160,4 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
    return 0;
 #endif
 }
+#endif
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -3,15 +3,17 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <string>
 #include <sstream>
-
 #include "ck/stream_config.hpp"
+#endif

 namespace ck {
 namespace tensor_operation {
 namespace device {

+#ifndef __HIPCC_RTC__
 struct BaseArgument
 {
    BaseArgument()                    = default;
@@ -36,6 +38,7 @@ struct BaseInvoker

    virtual ~BaseInvoker() {}
 };
+#endif

 struct BaseOperator
 {
@@ -43,6 +46,7 @@ struct BaseOperator
    BaseOperator(const BaseOperator&) = default;
    BaseOperator& operator=(const BaseOperator&) = default;

+#ifndef __HIPCC_RTC__
    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
    virtual std::string GetTypeString() const { return ""; }

@@ -66,7 +70,7 @@ struct BaseOperator
        assert(p_arg);
        p_arg->p_workspace_ = p_workspace;
    }
-
+#endif
    virtual ~BaseOperator() {}
 };

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -2,9 +2,10 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <vector>
+#endif

 #include "device_base.hpp"

@@ -28,6 +29,7 @@ template <typename ALayout,
          bool MaskOutUpperTriangle> // TODO: enum for mask type
 struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
 {
+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b0,
@@ -53,6 +55,7 @@ struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
                        CElementwiseOperation c_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };

 } // namespace device
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -2,9 +2,11 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <array>
+#endif

+#include "ck/utility/array.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"

 namespace ck {
@@ -34,6 +36,7 @@ struct DeviceGemmMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
@@ -51,6 +54,7 @@ struct DeviceGemmMultipleD : public BaseOperator
                        CDEElementwiseOperation cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };

 // GEMM:
@@ -76,6 +80,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
@@ -94,6 +99,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
                        CDEElementwiseOperation cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };

 } // namespace device
--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
@@ -28,7 +28,7 @@ enum struct GemmSpecialization
    NKOPadding,
    MNKOPadding,
 };
-
+#ifndef __HIPCC_RTC__
 inline std::string getGemmSpecializationString(const GemmSpecialization& s)
 {
    switch(s)
@@ -52,6 +52,7 @@ inline std::string getGemmSpecializationString(const GemmSpecialization& s)
    default: return "Unrecognized specialization!";
    }
 }
+#endif

 } // namespace device
 } // namespace tensor_operation
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif

 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -15,8 +19,6 @@
 #include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -40,27 +42,27 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
-            const FloatAB* __restrict__ p_a_grid,
-            const FloatAB* __restrict__ p_b_grid,
-            const FloatAB* __restrict__ p_b1_grid,
-            FloatC* __restrict__ p_c_grid,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const AccElementwiseOperation acc_element_op,
-            const B1ElementwiseOperation b1_element_op,
-            const CElementwiseOperation c_element_op,
-            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-            const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
-            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                c_grid_desc_mblock_mperblock_nblock_nperblock,
-            const Block2CTileMap block_2_ctile_map,
-            const index_t batch_count,
-            const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
-            const C0MatrixMask c0_matrix_mask)
+    kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
+        const FloatAB* __restrict__ p_a_grid,
+        const FloatAB* __restrict__ p_b_grid,
+        const FloatAB* __restrict__ p_b1_grid,
+        FloatC* __restrict__ p_c_grid,
+        const AElementwiseOperation a_element_op,
+        const BElementwiseOperation b_element_op,
+        const AccElementwiseOperation acc_element_op,
+        const B1ElementwiseOperation b1_element_op,
+        const CElementwiseOperation c_element_op,
+        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+        const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
+        const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+        const Block2CTileMap block_2_ctile_map,
+        const index_t batch_count,
+        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
+        const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx94__))
@@ -430,6 +432,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        matrix_padder.PadN,
        MaskOutUpperTriangle>;

+#ifndef __HIPCC_RTC__
    // Argument
    struct Argument : public BaseArgument
    {
@@ -604,6 +607,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };
+#endif

    static constexpr bool IsValidCompilationParameter()
    {
@@ -611,6 +615,97 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        return true;
    }

+    static constexpr bool
+    IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
+    {
+        // vector load/store check: the layout-selected extent must divide by the vector width
+        using Row = ck::tensor_layout::gemm::RowMajor;
+        using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+        // check vector load of A (Row -> K extent, Col -> M extent, other layouts rejected)
+        if constexpr(is_same_v<ALayout, Row>)
+        {
+            if(KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else if constexpr(is_same_v<ALayout, Col>)
+        {
+            if(MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // check vector load of B (Row -> N extent, Col -> K extent, other layouts rejected)
+        if constexpr(is_same_v<BLayout, Row>)
+        {
+            if(NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else if constexpr(is_same_v<BLayout, Col>)
+        {
+            if(KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // check vector load of B1 (Row -> Gemm1N extent, Col -> N extent, others rejected)
+        if constexpr(is_same_v<B1Layout, Row>)
+        {
+            if(Gemm1NRaw_ % B1BlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else if constexpr(is_same_v<B1Layout, Col>)
+        {
+            if(NRaw_ % B1BlockTransferSrcScalarPerVector != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // check vector store of C (Row -> Gemm1N extent, Col -> M extent, others rejected)
+        if constexpr(is_same_v<CLayout, Row>)
+        {
+            if(Gemm1NRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+        else if constexpr(is_same_v<CLayout, Col>)
+        {
+            if(MRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        return true; // every tensor passed its divisibility check
+    }
+
+#ifndef __HIPCC_RTC__
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(!ck::is_xdl_supported())
@@ -765,8 +860,271 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle

        return str.str();
    }
+#endif
+
+    // Bundle of padded grid descriptors, tile map, mask, element-wise ops and validity flags.
+    template <class ADesc, class BDesc, class B1Desc, class CDesc>
+    struct Descriptor
+    {
+        // A: pad the descriptor, then unmerge dim 1 (K) into (AK0, AK1) -> [AK0, M, AK1].
+        template <class AGridDescriptor>
+        static constexpr auto MakeAGridDescriptor_AK0_M_AK1(const AGridDescriptor& a_grid_desc)
+        {
+            const auto a_grid_desc_m_k = DeviceOp::matrix_padder.PadADescriptor_M_K(a_grid_desc);
+            const auto M               = a_grid_desc_m_k.GetLength(I0);
+            const auto K               = a_grid_desc_m_k.GetLength(I1);
+            const auto AK0             = K / AK1;
+
+            return transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+
+        // B: pad the descriptor, then unmerge dim 1 (K) into (BK0, BK1) -> [BK0, N, BK1].
+        template <class BGridDescriptor>
+        static constexpr auto MakeBGridDescriptor_BK0_N_BK1(const BGridDescriptor& b_grid_desc)
+        {
+            const auto b_grid_desc_n_k = DeviceOp::matrix_padder.PadBDescriptor_N_K(b_grid_desc);
+            const auto N               = b_grid_desc_n_k.GetLength(I0);
+            const auto K               = b_grid_desc_n_k.GetLength(I1);
+            const auto BK0             = K / BK1;
+
+            return transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+
+        // B1: pad the descriptor, then unmerge dim 1 (K) into (B1K0, B1K1) -> [B1K0, N, B1K1].
+        template <class B1GridDescriptor>
+        static constexpr auto MakeB1GridDescriptor_BK0_N_BK1(const B1GridDescriptor& b1_grid_desc)
+        {
+            const auto b1_grid_desc_n_k = DeviceOp::matrix_padder.PadB1Descriptor_N_K(b1_grid_desc);
+            const auto N                = b1_grid_desc_n_k.GetLength(I0);
+            const auto K                = b1_grid_desc_n_k.GetLength(I1);
+            const auto B1K0             = K / B1K1;
+
+            return transform_tensor_descriptor(
+                b1_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+
+        // C: only padding is applied; the result stays a two-dimensional M x N descriptor.
+        template <class CGridDescriptor>
+        static constexpr auto MakeCGridDescriptor_M_N(const CGridDescriptor& c_grid_desc)
+        {
+            return DeviceOp::matrix_padder.PadCDescriptor_M_N(c_grid_desc);
+        }
+
+        using AGridDesc_AK0_M_AK1 =
+            remove_cvref_t<decltype(MakeAGridDescriptor_AK0_M_AK1(ADesc{}))>;
+        using BGridDesc_BK0_N_BK1 =
+            remove_cvref_t<decltype(MakeBGridDescriptor_BK0_N_BK1(BDesc{}))>;
+        using B1GridDesc_BK0_N_BK1 =
+            remove_cvref_t<decltype(MakeB1GridDescriptor_BK0_N_BK1(B1Desc{}))>;
+        using CGridDesc_M_N = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(CDesc{}))>;
+
+        // GridwiseGemm
+        using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+            ADataType, // TODO: distinguish A/B datatype
+            GemmAccDataType,
+            CShuffleDataType,
+            CDataType,
+            AElementwiseOperation,
+            BElementwiseOperation,
+            AccElementwiseOperation,
+            B1ElementwiseOperation,
+            CElementwiseOperation,
+            InMemoryDataOperationEnum::Set,
+            AGridDesc_AK0_M_AK1,
+            BGridDesc_BK0_N_BK1,
+            B1GridDesc_BK0_N_BK1,
+            CGridDesc_M_N,
+            NumGemmKPrefetchStage,
+            BlockSize,
+            MPerBlock,
+            NPerBlock,
+            KPerBlock,
+            Gemm1NPerBlock,
+            Gemm1KPerBlock,
+            AK1,
+            BK1,
+            B1K1,
+            MPerXDL,
+            NPerXDL,
+            MXdlPerWave,
+            NXdlPerWave,
+            Gemm1NXdlPerWave,
+            ABlockTransferThreadClusterLengths_AK0_M_AK1,
+            ABlockTransferThreadClusterArrangeOrder,
+            ABlockTransferSrcAccessOrder,
+            ABlockTransferSrcVectorDim,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_AK1,
+            true,
+            ABlockLdsExtraM,
+            BBlockTransferThreadClusterLengths_BK0_N_BK1,
+            BBlockTransferThreadClusterArrangeOrder,
+            BBlockTransferSrcAccessOrder,
+            BBlockTransferSrcVectorDim,
+            BBlockTransferSrcScalarPerVector,
+            BBlockTransferDstScalarPerVector_BK1,
+            true,
+            BBlockLdsExtraN,
+            B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+            B1BlockTransferThreadClusterArrangeOrder,
+            B1BlockTransferSrcAccessOrder,
+            B1BlockTransferSrcVectorDim,
+            B1BlockTransferSrcScalarPerVector,
+            B1BlockTransferDstScalarPerVector_BK1,
+            false,
+            B1BlockLdsExtraN,
+            CShuffleMXdlPerWavePerShuffle,
+            CShuffleNXdlPerWavePerShuffle,
+            CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+            CShuffleBlockTransferScalarPerVector_NPerBlock,
+            LoopSched,
+            matrix_padder.PadN,
+            MaskOutUpperTriangle>;
+
+        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1;
+        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1;
+        B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1;
+        CGridDesc_M_N c_grid_desc_m_n;
+        C0MatrixMask c0_matrix_mask;
+        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map;
+        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_descriptor_mblock_mperblock_nblock_nperblock;
+
+        // element-wise op
+        AElementwiseOperation a_element_op;
+        BElementwiseOperation b_element_op;
+        B1ElementwiseOperation b1_element_op;
+        CElementwiseOperation c_element_op;
+
+        bool has_main_k_block_loop = true;
+        bool is_valid              = false;
+
+        // Builds all descriptors from the raw inputs; initializers follow declaration order.
+        constexpr Descriptor(ADesc a,
+                             BDesc b,
+                             B1Desc b1,
+                             CDesc c,
+                             AElementwiseOperation a_element_op_,
+                             BElementwiseOperation b_element_op_,
+                             B1ElementwiseOperation b1_element_op_,
+                             CElementwiseOperation c_element_op_)
+            : a_grid_desc_ak0_m_ak1{MakeAGridDescriptor_AK0_M_AK1(a)},
+              b_grid_desc_bk0_n_bk1{MakeBGridDescriptor_BK0_N_BK1(b)},
+              b1_grid_desc_bk0_n_bk1{MakeB1GridDescriptor_BK0_N_BK1(b1)},
+              c_grid_desc_m_n{MakeCGridDescriptor_M_N(c)},
+              c0_matrix_mask{b.GetLength(I0)}, // raw Gemm0 N (B is N x K), not Gemm1 N of C
+              block_2_ctile_map{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n)},
+              c_grid_descriptor_mblock_mperblock_nblock_nperblock{
+                  GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                      c_grid_desc_m_n)},
+              a_element_op{a_element_op_},
+              b_element_op{b_element_op_},
+              b1_element_op{b1_element_op_},
+              c_element_op{c_element_op_},
+              has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
+                  a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
+              is_valid{GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1,
+                                                   b_grid_desc_bk0_n_bk1,
+                                                   b1_grid_desc_bk0_n_bk1,
+                                                   c_grid_desc_m_n,
+                                                   block_2_ctile_map) and
+                       IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
+                                   b_grid_desc_bk0_n_bk1.GetLength(I1),
+                                   a_grid_desc_ak0_m_ak1.GetLength(I0) *
+                                       a_grid_desc_ak0_m_ak1.GetLength(I2),
+                                   b1_grid_desc_bk0_n_bk1.GetLength(I1))}
+        {
+        }
+
+        constexpr bool IsValid() const { return is_valid; }
+    };
+
+    template <class ADesc, class BDesc, class B1Desc, class CDesc>
+    static constexpr auto
+    make_descriptor(ADesc a,
+                    BDesc b,
+                    B1Desc b1,
+                    CDesc c,
+                    AElementwiseOperation a_element_op   = AElementwiseOperation{},
+                    BElementwiseOperation b_element_op   = BElementwiseOperation{},
+                    B1ElementwiseOperation b1_element_op = B1ElementwiseOperation{},
+                    CElementwiseOperation c_element_op   = CElementwiseOperation{})
+    { // factory: deduces the four descriptor types; element-wise ops default-construct
+        return Descriptor<ADesc, BDesc, B1Desc, CDesc>(
+            a, b, b1, c, a_element_op, b_element_op, b1_element_op, c_element_op);
+    }
+
+    template <class Desc>
+    __device__ static void Run(const Desc& desc,
+                               const float scale,
+                               const ADataType* __restrict__ p_a_grid,
+                               const ADataType* __restrict__ p_b_grid,
+                               const ADataType* __restrict__ p_b1_grid,
+                               CDataType* __restrict__ p_c_grid)
+    { // device-side entry: runs Desc::GridwiseGemm on a prebuilt descriptor bundle
+#ifndef __HIPCC_RTC__
+        assert(desc.is_valid); // this check is compiled out when __HIPCC_RTC__ is defined
+#endif
+        __shared__ char p_shared_block[Desc::GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        AccElementwiseOperation acc_element_op{scale}; // accumulator op built from scale
+        // pick the Run<true>/Run<false> instantiation from the runtime main-K-loop flag
+        if(desc.has_main_k_block_loop)
+        {
+            Desc::GridwiseGemm::template Run<true>(
+                p_a_grid,
+                p_b_grid,
+                p_b1_grid,
+                p_c_grid,
+                p_shared_block,
+                desc.a_element_op,
+                desc.b_element_op,
+                acc_element_op,
+                desc.b1_element_op,
+                desc.c_element_op,
+                desc.a_grid_desc_ak0_m_ak1,
+                desc.b_grid_desc_bk0_n_bk1,
+                desc.b1_grid_desc_bk0_n_bk1,
+                desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
+                desc.block_2_ctile_map,
+                desc.c0_matrix_mask);
+        }
+        else
+        {
+            Desc::GridwiseGemm::template Run<false>( // same arguments as the branch above
+                p_a_grid,
+                p_b_grid,
+                p_b1_grid,
+                p_c_grid,
+                p_shared_block,
+                desc.a_element_op,
+                desc.b_element_op,
+                acc_element_op,
+                desc.b1_element_op,
+                desc.c_element_op,
+                desc.a_grid_desc_ak0_m_ak1,
+                desc.b_grid_desc_bk0_n_bk1,
+                desc.b1_grid_desc_bk0_n_bk1,
+                desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
+                desc.block_2_ctile_map,
+                desc.c0_matrix_mask);
+        }
+    }
 };

 } // namespace device
 } // namespace tensor_operation
-} // namespace ck
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif

 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -14,8 +18,6 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"

 namespace ck {

@@ -35,22 +37,22 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
-                                            const BDataType* __restrict__ p_b_grid,
-                                            DsPointer p_ds_grid,
-                                            EDataType* __restrict__ p_e_grid,
-                                            const AElementwiseOperation a_element_op,
-                                            const BElementwiseOperation b_element_op,
-                                            const CDEElementwiseOperation cde_element_op,
-                                            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
-                                            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
-                                            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                                ds_grid_desc_mblock_mperblock_nblock_nperblock,
-                                            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-                                                e_grid_desc_mblock_mperblock_nblock_nperblock,
-                                            const Block2ETileMap block_2_etile_map)
+    kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
+                                        const BDataType* __restrict__ p_b_grid,
+                                        DsPointer p_ds_grid,
+                                        EDataType* __restrict__ p_e_grid,
+                                        const AElementwiseOperation a_element_op,
+                                        const BElementwiseOperation b_element_op,
+                                        const CDEElementwiseOperation cde_element_op,
+                                        const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+                                        const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+                                        const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                                            ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                                            e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        const Block2ETileMap block_2_etile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx94__))
@@ -225,9 +227,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
    }

-    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                                         const std::array<index_t, NumDTensor>& NRaws,
-                                         const std::array<index_t, NumDTensor>& DsStride)
+    static auto MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
+                                         const Array<index_t, NumDTensor>& NRaws,
+                                         const Array<index_t, NumDTensor>& DsStride)
    {
        return generate_tuple(
            [&](auto i) {
@@ -309,6 +311,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
    using Block2ETileMap =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

+#ifndef __HIPCC_RTC__
    // Argument
    struct Argument : public BaseArgument
    {
@@ -498,6 +501,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        }
    };

+#endif
+
    static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
    {
        // check vector load/store
@@ -578,6 +583,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        return true;
    }

+#ifndef __HIPCC_RTC__
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(!ck::is_xdl_supported())
@@ -676,11 +682,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
    {
        auto str = std::stringstream();

-        std::map<LoopScheduler, std::string> LoopSchedToString{
-            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<LoopScheduler, std::string> LoopSchedToString{{LoopScheduler::Default, "Default"},
+                                                               { LoopScheduler::Interwave,
+                                                                 "Interwave" }};

        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
-                                                                       {PipelineVersion::v2, "v2"}};
+                                                                       { PipelineVersion::v2,
+                                                                         "v2" }};

        // clang-format off
        str << "DeviceGemmMultipleD_Xdl_CShuffle"
@@ -709,6 +717,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,

        return str.str();
    }
+#endif

    template <class ADesc, class BDesc, class DsDesc, class EDesc>
    struct Descriptor
@@ -847,7 +856,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                               EDataType* __restrict__ p_e_grid)
    {
        __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#ifndef __HIPCC_RTC__
        assert(desc.IsValid());
+#endif
        if(desc.has_main_k_block_loop)
        {
            GridwiseGemm::template Run<true>(p_a_grid,
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -13,6 +13,7 @@ enum struct MaskingSpecialization
    MaskOutUpperTriangle
 };

+#ifndef __HIPCC_RTC__
 inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
 {
    switch(s)
@@ -22,6 +23,7 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
    default: return "Unrecognized specialization!";
    }
 }
+#endif

 struct MaskDisabledPredicate
 {
@@ -53,7 +55,7 @@ struct MaskOutUpperTrianglePredicate
 template <typename MaskOutPredicate>
 struct C0MatrixMask_impl
 {
-    __host__ __device__ C0MatrixMask_impl(index_t NRaw)
+    __host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
        : NRaw_(NRaw), predicate_(MaskOutPredicate{})
    {
    }
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout

 } // namespace convolution

+#ifndef __HIPCC_RTC__
 template <
    typename Layout,
    typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
@@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
    os << Layout::name;
    return os;
 }
+#endif

 } // namespace tensor_layout
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -340,8 +340,8 @@ struct Bilinear
    };

    template <>
-    __host__ __device__ constexpr void operator()<std::int8_t, std::int32_t, std::int8_t>(
-        std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const
+    __host__ __device__ constexpr void operator()<int8_t, int32_t, int8_t>(
+        int8_t& y, const int32_t& x0, const int8_t& x1) const
    {
        y = type_convert<int8_t>(alpha_ * type_convert<float>(x0) +
                                 beta_ * type_convert<float>(x1));
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -466,7 +466,7 @@ struct FastGelu

    template <typename Y, typename X>
    __device__ void operator()(Y& y, const X& x) const;
-
+#ifndef __HIPCC_RTC__
    template <>
    __host__ void operator()<float, float>(float& y, const float& x) const
    {
@@ -477,7 +477,7 @@ struct FastGelu
        const float emu = exp(u);
        y               = x / (1.f + emu);
    }
-
+#endif
    // device code, use lower precision "__ocml_exp_f32" and "rcp"
    template <>
    __device__ void operator()<float, float>(float& y, const float& x) const
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -7,8 +7,10 @@
 #include "ck/utility/number.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef __HIPCC_RTC__
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -979,7 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit
        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
        const auto N0 = math::integer_divide_ceil(N, NPerBlock);

-        return std::make_tuple(N0, M0, k_split);
+        return ck::make_tuple(N0, M0, k_split);
    }

    template <typename TopIdx>
@@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK
            uint32_t dp_for_sk_iters = k_iters_per_tile.get();

            uint32_t best_sk_score =
-                std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+                ck::NumericLimits<int>::Max(); // we need to find the smallest sk iters
            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
                tentative_sk_blocks++)
            {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -475,9 +475,9 @@ struct GridwiseGemmMultipleD_xdl_cshuffle

    template <typename DsLayout, GemmSpecialization GemmSpec>
    __host__ __device__ static auto
-    MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                             const std::array<index_t, NumDTensor>& NRaws,
-                             const std::array<index_t, NumDTensor>& DsStride)
+    MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
+                             const Array<index_t, NumDTensor>& NRaws,
+                             const Array<index_t, NumDTensor>& DsStride)
    {
        return generate_tuple(
            [&](auto i) {
@@ -941,7 +941,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                               const index_t K,
                               const index_t StrideA,
                               const index_t StrideB,
-                               const std::array<index_t, NumDTensor> StrideDs,
+                               const Array<index_t, NumDTensor> StrideDs,
                               const index_t StrideE,
                               const Block2ETileMap& block_2_etile_map)
    {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -3,8 +3,10 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <ostream>
+#endif

 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
@@ -53,12 +55,15 @@ constexpr auto GridwiseGemmPipeline_Selector()
    }
    else
    {
+#ifndef __HIPCC_RTC__
        std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl;
+#endif
    }
 }

 } // namespace ck

+#ifndef __HIPCC_RTC__
 inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
 {
    switch(p)
@@ -71,3 +76,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
    }
    return os;
 }
+#endif
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1005,6 +1005,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

+#ifndef __HIPCC_RTC__
 template <typename T, index_t NumElemsPerThread>
 __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                              const index_t global_offset,
@@ -1042,5 +1043,6 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
 #endif
 }
+#endif

 } // namespace ck
--- a/include/ck/utility/amd_wave_read_first_lane.hpp
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -7,10 +7,12 @@
 #include "ck/utility/functional2.hpp"
 #include "ck/utility/math.hpp"

+#ifndef __HIPCC_RTC__
 #include <array>
 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
+#endif

 namespace ck {
 namespace detail {
@@ -37,7 +39,7 @@ struct get_carrier<3>
    {
        using value_type = uint32_t;

-        std::array<std::byte, 3> bytes;
+        Array<ck::byte, 3> bytes;
        static_assert(sizeof(bytes) <= sizeof(value_type));

        // replacement of host std::copy_n()
@@ -61,22 +63,22 @@ struct get_carrier<3>
        // method to trigger template substitution failure
        __device__ carrier(const carrier& other) noexcept
        {
-            copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
+            copy_n(other.bytes.begin(), bytes.Size(), bytes.begin());
        }

        public:
        __device__ carrier& operator=(value_type value) noexcept
        {
-            copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
+            copy_n(reinterpret_cast<const ck::byte*>(&value), bytes.Size(), bytes.begin());

            return *this;
        }

        __device__ operator value_type() const noexcept
        {
-            std::byte result[sizeof(value_type)];
+            ck::byte result[sizeof(value_type)];

-            copy_n(bytes.begin(), bytes.size(), result);
+            copy_n(bytes.begin(), bytes.Size(), result);

            return *reinterpret_cast<const value_type*>(result);
        }
@@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)
 {
    constexpr unsigned object_size        = sizeof(int64_t);
    constexpr unsigned second_part_offset = object_size / 2;
-    auto* const from_obj                  = reinterpret_cast<const std::byte*>(&value);
-    alignas(int64_t) std::byte to_obj[object_size];
+    auto* const from_obj                  = reinterpret_cast<const ck::byte*>(&value);
+    alignas(int64_t) ck::byte to_obj[object_size];

    using Sgpr = uint32_t;

@@ -124,15 +126,15 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)

 template <
    typename Object,
-    typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
+    typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
 __device__ auto amd_wave_read_first_lane(const Object& obj)
 {
    using Size                = unsigned;
    constexpr Size SgprSize   = 4;
    constexpr Size ObjectSize = sizeof(Object);

-    auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);
-    alignas(Object) std::byte to_obj[ObjectSize];
+    auto* const from_obj = reinterpret_cast<const ck::byte*>(&obj);
+    alignas(Object) ck::byte to_obj[ObjectSize];

    constexpr Size RemainedSize             = ObjectSize % SgprSize;
    constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
--- a/include/ck/utility/array.hpp
+++ b/include/ck/utility/array.hpp
@@ -38,6 +38,8 @@ struct Array
    }
    __host__ __device__ constexpr const TData* begin() const { return &mData[0]; }
    __host__ __device__ constexpr const TData* end() const { return &mData[NSize]; }
+    __host__ __device__ constexpr TData* begin() { return &mData[0]; }
+    __host__ __device__ constexpr TData* end() { return &mData[NSize]; }
 };

 // empty Array
@@ -54,7 +56,7 @@ template <typename X, typename... Xs>
 __host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs)
 {
    using data_type = remove_cvref_t<X>;
-    return Array<data_type, sizeof...(Xs) + 1>{std::forward<X>(x), std::forward<Xs>(xs)...};
+    return Array<data_type, sizeof...(Xs) + 1>{ck::forward<X>(x), ck::forward<Xs>(xs)...};
 }

 // make empty array
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -326,14 +326,14 @@ template <typename T, index_t NX, index_t NY>
 __host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
 {
    return unpack2(
-        [&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
+        [&](auto&&... zs) { return make_array(ck::forward<decltype(zs)>(zs)...); }, ax, ay);
 }

 template <typename... X, typename... Y>
 __host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
 {
    return unpack2(
-        [&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
+        [&](auto&&... zs) { return make_tuple(ck::forward<decltype(zs)>(zs)...); }, tx, ty);
 }

 template <typename Container>
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -5,8 +5,25 @@

 #include "ck/utility/statically_indexed_array.hpp"

+#ifdef __HIPCC_RTC__
+/// Definitions from <cstdint>, <cmath> conflict with
+/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.
+
+using int8_t   = signed char;
+using uint8_t  = unsigned char;
+using int16_t  = signed short;
+using uint16_t = unsigned short;
+using float_t  = float;
+#endif // __HIPCC_RTC__
+
 namespace ck {

+#ifdef __HIPCC_RTC__
+using byte = unsigned char;
+#else
+using std::byte;
+#endif
+
 using bhalf_t = ushort;
 using half_t  = _Float16;
 using int4_t  = _BitInt(4);
@@ -1060,6 +1077,146 @@ using uint8x16_t = typename vector_type<uint8_t, 16>::type;
 using uint8x32_t = typename vector_type<uint8_t, 32>::type;
 using uint8x64_t = typename vector_type<uint8_t, 64>::type;

+#ifdef __HIPCC_RTC__
+template <typename T>
+struct NumericLimits;
+
+template <>
+struct NumericLimits<int32_t>
+{
+    __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
+
+    __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
+
+    __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
+
+    __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<int16_t>
+{
+    __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
+
+    __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
+
+    __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
+
+    __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<int8_t>
+{
+    __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
+
+    __host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
+
+    __host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
+
+    __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<uint32_t>
+{
+    __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
+
+    __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<uint16_t>
+{
+    __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
+
+    __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<float>
+{
+    static constexpr unsigned int binary_min    = 0x00800000; // smallest positive normal value
+    static constexpr unsigned int binary_max    = 0x7F7FFFFF; // largest finite value
+    static constexpr unsigned int binary_lowest = 0xFF7FFFFF; // most negative finite value
+    static constexpr unsigned int binary_qnan   = 0xFFC00001; // quiet NaN (sign bit set, payload 1)
+    static constexpr unsigned int binary_inf    = 0x7F800000; // +inf: exponent all ones, mantissa 0
+
+    __host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
+
+    __host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
+
+    __host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
+
+    __host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
+
+    __host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
+};
+
+template <>
+struct NumericLimits<half_t>
+{
+    static constexpr unsigned short binary_min    = 0x0400;
+    static constexpr unsigned short binary_max    = 0x7BFF;
+    static constexpr unsigned short binary_lowest = 0xFBFF;
+    static constexpr unsigned short binary_qnan   = 0x7FFF;
+
+    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
+
+    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
+
+    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
+
+    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
+};
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+template <>
+struct NumericLimits<int4_t>
+{
+    __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
+
+    __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
+
+    __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
+};
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+
+template <>
+struct NumericLimits<f8_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    static constexpr uint8_t binary_max    = 0x77; // 0b01110111
+    static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+
+    __host__ __device__ static constexpr f8_t Min() { return bit_cast<f8_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_t Max() { return bit_cast<f8_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_t Lowest() { return bit_cast<f8_t>(binary_lowest); }
+
+    __host__ __device__ static constexpr f8_t QuietNaN() { return bit_cast<f8_t>(binary_qnan); }
+};
+#else
 template <typename T>
 struct NumericLimits
 {
@@ -1151,6 +1308,7 @@ struct NumericLimits<bf8_t>

    __host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); }
 };
+#endif

 template <typename T>
 struct NumericUtils
--- a/include/ck/utility/enable_if.hpp
+++ b/include/ck/utility/enable_if.hpp
@@ -4,11 +4,26 @@
 #pragma once

 namespace ck {
+#ifdef __HIPCC_RTC__
+template <bool B, class T = void>
+struct enable_if
+{
+};

+template <class T>
+struct enable_if<true, T>
+{
+    using type = T;
+};
+
+template <bool B, class T = void>
+using enable_if_t = typename enable_if<B, T>::type;
+
+#else
 template <bool B, typename T = void>
 using enable_if = std::enable_if<B, T>;

 template <bool B, typename T = void>
 using enable_if_t = typename std::enable_if<B, T>::type;
-
+#endif
 } // namespace ck
--- a/include/ck/utility/env.hpp
+++ b/include/ck/utility/env.hpp
@@ -183,3 +183,7 @@ void UpdateEnvVar(EnvVar, const std::string_view& val)
 }

 } // namespace ck
+
+// environment variable to enable logging:
+// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
+CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
--- a/include/ck/utility/functional.hpp
+++ b/include/ck/utility/functional.hpp
@@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y)
 {
    if constexpr(predicate)
    {
-        return std::forward<X>(x);
+        return ck::forward<X>(x);
    }
    else
    {
-        return std::forward<Y>(y);
+        return ck::forward<Y>(y);
    }
 }

--- a/include/ck/utility/functional4.hpp
+++ b/include/ck/utility/functional4.hpp
@@ -21,7 +21,7 @@ struct unpack_impl<Sequence<Is...>>
    template <typename F, typename X>
    __host__ __device__ constexpr auto operator()(F&& f, X&& x) const
    {
-        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...);
+        return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...);
    }
 };

@@ -35,8 +35,8 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
    template <typename F, typename X, typename Y>
    __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
    {
-        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...,
-                                  std::forward<Y>(y).At(Number<Js>{})...);
+        return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...,
+                                  ck::forward<Y>(y).At(Number<Js>{})...);
    }
 };

@@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
 {
    using X_ = remove_reference_t<X>;
    return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
-        std::forward<F>(f), std::forward<X>(x));
+        ck::forward<F>(f), ck::forward<X>(x));
 }

 // TODO: properly implement unpack that takes any number of containers
@@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
    using Y_ = remove_reference_t<Y>;
    return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
                                typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
-        std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y));
+        ck::forward<F>(f), ck::forward<X>(x), ck::forward<Y>(y));
 }

 } // namespace ck
--- a/include/ck/utility/is_detected.hpp
+++ b/include/ck/utility/is_detected.hpp
@@ -9,14 +9,14 @@ namespace detail {
 template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
 struct detector
 {
-    using value_t = std::false_type;
+    using value_t = ck::false_type;
    using type    = Default;
 };

 template <class Default, template <class...> class Op, class... Args>
-struct detector<Default, std::void_t<Op<Args...>>, Op, Args...>
+struct detector<Default, ck::void_t<Op<Args...>>, Op, Args...>
 {
-    using value_t = std::true_type;
+    using value_t = ck::true_type;
    using type    = Op<Args...>;
 };
 } // namespace detail
@@ -32,12 +32,12 @@ template <template <class...> class Op, class... Args>
 using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;

 template <typename T>
-using is_pack2_invocable_t = decltype(std::declval<T&>().is_pack2_invocable);
+using is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable);

 template <typename T>
-using is_pack4_invocable_t = decltype(std::declval<T&>().is_pack4_invocable);
+using is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable);

 template <typename T>
-using is_pack8_invocable_t = decltype(std::declval<T&>().is_pack8_invocable);
+using is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable);

 } // namespace ck
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-#include <ostream>

 #pragma once
+#ifndef __HIPCC_RTC__
+#include <ostream>
+#endif

 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
@@ -26,6 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()

 } // namespace ck

+#ifndef __HIPCC_RTC__
 inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
 {
    switch(s)
@@ -36,3 +39,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
    }
    return os;
 }
+#endif
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -30,7 +30,7 @@ struct MagicDivision
        // WARNING: magic division is only applicable for division inside this range.
        // You should use the return value of CalculateMagicNumbers, if division is not inside this
        // range. The "else" logic below is to quiet down run-time error.
-        if(divisor >= 1 && divisor <= INT32_MAX)
+        if(divisor >= 1 && divisor <= ck::NumericLimits<int32_t>::Max())
        {
            uint32_t shift = 0;
            for(shift = 0; shift < 32; ++shift)
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -18,6 +18,7 @@ namespace math {
 extern "C" __device__ float __ocml_native_recip_f32(float);
 #endif

+#ifndef __HIPCC_RTC__
 // math functions for the host,  some are implemented by calling C++ std functions

 static inline __host__ float abs(float x) { return std::abs(x); };
@@ -457,6 +458,7 @@ inline __host__ double expm1<double>(double x)
 {
    return std::expm1(x);
 }
+#endif

 // math functions for the HIP kernel,  some are implemented by calling hip builtin functions

@@ -920,5 +922,23 @@ inline __device__ double expm1<double>(double x)
    return expm1(x);
 };

+template <typename T>
+inline __device__ T cos(T x)
+{
+    return ck::type_convert<T>(cosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float cos<float>(float x)
+{
+    return cosf(x);
+};
+
+template <>
+inline __device__ double cos<double>(double x)
+{
+    return ::cos(x); // must be qualified: unqualified cos(x) resolves back to this specialization
+};
+
 } // namespace math
 } // namespace ck
--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
@@ -7,7 +7,7 @@ namespace ck {

 // Pseudo random number generator
 // version for fp32
-template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<ck::is_same<float, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint32_t x         = *(reinterpret_cast<uint32_t*>(&val));
@@ -23,7 +23,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // version for fp16
-template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<ck::is_same<half_t, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
@@ -40,12 +40,18 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 // return 0 if data is not fp16 or fp32
 template <typename T,
          uint32_t seed_t,
-          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+          ck::enable_if_t<!(ck::is_same<float, T>{} || ck::is_same<half_t, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
+    #ifdef __HIPCC_RTC__
+    static_cast<void>(id);
+    static_cast<void>(val);
+    static_cast<void>(seed);
+    #else
    std::ignore = id;
    std::ignore = val;
    std::ignore = seed;
+    #endif

    return 0;
 }
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -3,7 +3,9 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <ostream>
+#endif

 #include "ck/utility/integral_constant.hpp"
 #include "ck/utility/type.hpp"
@@ -900,6 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;

 } // namespace ck

+#ifndef __HIPCC_RTC__
 template <ck::index_t... Is>
 std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
 {
@@ -910,3 +913,4 @@ std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
    os << S::At(S::Size() - ck::Number<1>{}).value << "}";
    return os;
 }
+#endif
--- a/include/ck/utility/tuple.hpp
+++ b/include/ck/utility/tuple.hpp
@@ -32,7 +32,7 @@ struct TupleElementKeyData
    template <typename T,
              typename enable_if<!is_same<remove_cvref_t<T>, TupleElementKeyData>::value,
                                 bool>::type = false>
-    __host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(std::forward<T>(v))
+    __host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(ck::forward<T>(v))
    {
    }

@@ -67,7 +67,7 @@ get_tuple_element_data_reference(TupleElementKeyData<Key, Data>&& x)
 template <typename Key, typename Data>
 __host__ __device__ constexpr Data get_tuple_element_data(const TupleElementKeyData<Key, Data>& x)
 {
-    return std::forward(x.mData);
+    return static_cast<Data>(x.mData); // forward() without <T> cannot deduce its argument
 }

 template <typename Indices, typename... Xs>
@@ -83,13 +83,13 @@ struct TupleImpl<Sequence<Is...>, Xs...> : TupleElementKeyData<TupleElementKey<I
                                     !is_same<remove_cvref_t<Y>, TupleImpl>::value,
                                 bool>::type = false>
    __host__ __device__ constexpr TupleImpl(Y&& y)
-        : TupleElementKeyData<TupleElementKey<Is>, Xs>(std::forward<Y>(y))...
+        : TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Y>(y))...
    {
    }

    template <typename... Ys, typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
    __host__ __device__ constexpr TupleImpl(Ys&&... ys)
-        : TupleElementKeyData<TupleElementKey<Is>, Xs>(std::forward<Ys>(ys))...
+        : TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Ys>(ys))...
    {
        static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys),
                      "wrong! inconsistent size");
@@ -123,14 +123,14 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
    template <typename Y,
              typename enable_if<sizeof...(Xs) == 1 && !is_same<remove_cvref_t<Y>, Tuple>::value,
                                 bool>::type = false>
-    __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward<Y>(y))
+    __host__ __device__ constexpr Tuple(Y&& y) : base(ck::forward<Y>(y))
    {
    }

    template <typename... Ys,
              typename enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 2, bool>::type =
                  false>
-    __host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
+    __host__ __device__ constexpr Tuple(Ys&&... ys) : base(ck::forward<Ys>(ys)...)
    {
    }

@@ -210,7 +210,7 @@ using tuple_element_t = typename tuple_element<I, TTuple>::type;
 template <typename... Xs>
 __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
 {
-    return Tuple<remove_cvref_t<Xs>...>(std::forward<Xs>(xs)...);
+    return Tuple<remove_cvref_t<Xs>...>(ck::forward<Xs>(xs)...);
 }

 // https://en.cppreference.com/w/cpp/utility/tuple/tie
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -29,7 +29,7 @@ __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>&
                                                             const Tuple<Y&...>& ty)
 {
    return unpack2(
-        [&](auto&&... zs) { return Tuple<decltype(zs)...>{std::forward<decltype(zs)>(zs)...}; },
+        [&](auto&&... zs) { return Tuple<decltype(zs)...>{ck::forward<decltype(zs)>(zs)...}; },
        tx,
        ty);
 }
@@ -38,7 +38,7 @@ template <typename... X, typename... Y>
 __host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuple<Y...>& ty)
 {
    return unpack2(
-        [&](auto... zs) { return Tuple<decltype(zs)...>{std::forward<decltype(zs)>(zs)...}; },
+        [&](auto... zs) { return Tuple<decltype(zs)...>{ck::forward<decltype(zs)>(zs)...}; },
        tx,
        ty);
 }
@@ -157,6 +157,7 @@ __host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
    }
 }

+#ifndef __HIPCC_RTC__
 template <typename T>
 using is_tuple = decltype(std::declval<T&>().IsTuple());

@@ -165,6 +166,7 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
 {
    return (is_detected<is_tuple, Ts>::value || ...);
 }
+#endif

 template <index_t depth = 0, typename T>
 __host__ __device__ constexpr auto TupleDepth(const T&)
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -8,6 +8,158 @@
 #include "ck/utility/enable_if.hpp"

 namespace ck {
+#ifdef __HIPCC_RTC__
+template <bool B>
+using bool_constant = integral_constant<bool, B>;
+
+using true_type  = bool_constant<true>;
+using false_type = bool_constant<false>;
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT1(name)         \
+    template <class T>                       \
+    struct name : bool_constant<__##name(T)> \
+    {                                        \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT2(name)            \
+    template <class T, class U>                 \
+    struct name : bool_constant<__##name(T, U)> \
+    {                                           \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAITN(name)             \
+    template <class... Ts>                       \
+    struct name : bool_constant<__##name(Ts...)> \
+    {                                            \
+    }
+
+CK_BUILTIN_TYPE_TRAIT1(is_class);
+CK_BUILTIN_TYPE_TRAIT1(is_pointer);
+CK_BUILTIN_TYPE_TRAIT1(is_reference);
+CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
+CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
+CK_BUILTIN_TYPE_TRAIT2(is_base_of);
+
+/// Strips top-level const/volatile qualifiers from T (RTC stand-in for std::remove_cv).
+template <class T>
+struct remove_cv
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_cv<const T> { using type = T; };
+template <class T>
+struct remove_cv<volatile T> { using type = T; };
+// `const volatile T` would otherwise match both partial specializations above equally
+// well, making remove_cv<const volatile X> an ambiguous-specialization error.
+template <class T>
+struct remove_cv<const volatile T> { using type = T; };
+
+template <class T>
+struct remove_reference
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&&>
+{
+    typedef T type;
+};
+
+template <class T>
+struct remove_pointer
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T*>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* volatile>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const volatile>
+{
+    typedef T type;
+};
+
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+
+// TODO
+template<class T> struct is_const          : false_type {};
+template<class T> struct is_const<const T> : true_type {};
+template< class T >
+inline constexpr bool is_const_v = is_const<T>::value;
+
+template< class T >
+inline constexpr bool is_reference_v = is_reference<T>::value;
+
+template<class T> struct remove_const { typedef T type; };
+template<class T> struct remove_const<const T> { typedef T type; };
+template< class T >
+using remove_const_t = typename remove_const<T>::type;
+
+template< class T >
+inline constexpr bool is_class_v = is_class<T>::value;
+
+template< class T >
+inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
+
+template< class... >
+using void_t = void;
+
+using __hip::declval;
+#else
+#include <utility>
+#include <type_traits>
+using std::forward;
+using std::is_base_of;
+using std::is_class;
+using std::is_pointer;
+using std::is_reference;
+using std::is_trivially_copyable;
+using std::is_unsigned;
+using std::remove_cv;
+using std::remove_pointer;
+using std::remove_reference;
+using std::is_const_v;
+using std::is_reference_v;
+using std::remove_const_t;
+using std::is_class_v;
+using std::is_trivially_copyable_v;
+using std::void_t;
+using std::false_type;
+using std::true_type;
+using std::declval;
+#endif

 template <typename X, typename Y>
 struct is_same : public integral_constant<bool, false>
@@ -23,19 +175,19 @@ template <typename X, typename Y>
 inline constexpr bool is_same_v = is_same<X, Y>::value;

 template <typename T>
-using remove_reference_t = typename std::remove_reference<T>::type;
+using remove_reference_t = typename remove_reference<T>::type;

 template <typename T>
-using remove_cv_t = typename std::remove_cv<T>::type;
+using remove_cv_t = typename remove_cv<T>::type;

 template <typename T>
-using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;
+using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;

 template <typename T>
-using remove_pointer_t = typename std::remove_pointer<T>::type;
+using remove_pointer_t = typename remove_pointer<T>::type;

 template <typename T>
-inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
+inline constexpr bool is_pointer_v = is_pointer<T>::value;

 template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
 __host__ __device__ constexpr Y bit_cast(const X& x)
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -17,10 +17,10 @@ namespace ck {
 // Convert X to Y, both X and Y are non-const data types.
 template <typename Y,
          typename X,
-          std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
+          ck::enable_if_t<!(ck::is_const_v<Y> || ck::is_const_v<X>), bool> = false>
 __host__ __device__ constexpr Y type_convert(X x)
 {
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+    static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);

    return static_cast<Y>(x);
 }
@@ -28,13 +28,13 @@ __host__ __device__ constexpr Y type_convert(X x)
 // Convert X to Y, either X or Y is a const data type.
 template <typename Y,
          typename X,
-          std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
+          ck::enable_if_t<ck::is_const_v<Y> || ck::is_const_v<X>, bool> = false>
 __host__ __device__ constexpr Y type_convert(X x)
 {
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+    static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);

-    using NonConstY = std::remove_const_t<Y>;
-    using NonConstX = std::remove_const_t<X>;
+    using NonConstY = ck::remove_const_t<Y>;
+    using NonConstX = ck::remove_const_t<X>;
    return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
 }

@@ -104,7 +104,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert_sp(X x)
 {
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+    static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);

    return static_cast<Y>(x);
 }
@@ -166,7 +166,7 @@ template <>
 inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
 {
    constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<long_index_t>(&x), x);
 #if defined(__gfx94__)
    union
    {
@@ -206,7 +206,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<long_index_t>(&x), x);
    return utils::
        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
            x, rng);
@@ -218,7 +218,7 @@ template <>
 inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
 {
    constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<long_index_t>(&x), x);
 #if defined(__gfx94__)
    union
    {
@@ -258,7 +258,7 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<long_index_t>(&x), x);
    return utils::
        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
            x, rng);
@@ -501,6 +501,7 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
 #endif
 }

+#ifndef __HIPCC_RTC__
 template <typename Y, typename X, std::size_t NumElems>
 inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
                                              const std::array<X, NumElems>& x)
@@ -510,6 +511,7 @@ inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
        y[i] = type_convert<Y>(x[i]);
    }
 }
+#endif

 template <typename Y, typename X, index_t NumElems>
 inline __host__ __device__ void array_convert(Array<Y, NumElems>& y, const Array<X, NumElems>& x)