Codegen hipRTC compilation (#1579)

* updating codegen build for MIOpen access: adding .cmake for codegen component * updating CMake * adding in header guards for some headers due to issues with hiprtc compilation in MIOpen * some more header guards * putting env file in header guard * cleaning up some includes * updated types file for hiprtc purposes * fixed types file: bit-wise/memcpy issue * updating multiple utility files to deal with standard header inclusion for hiprtc * added some more header guards in the utility files, replacing some standard header functionality * added some more header guards * fixing some conflicts in utility files, another round of header guards * fixing errors in data type file * resolved conflict errors in a few utility files * added header guards/replicated functionality in device files * resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd * resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp * added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp * fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl * replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector * resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit * added replicated functionality for standard header methods in utility files * replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp * temp fix for namespace error in MIOpen * remove standard header usage in 
codegen device op * removed standard header usage in elementwise files, resolved namespace errors * formatting fix * changed codegen argument to ON for testing * temporarily removing codegen compiler flag for testing purposes * added codegen flag again, set default to ON * set codegen flag default back to OFF * replaced enable_if_t standard header usage in data_type.hpp * added some debug prints to pinpoint issues in MIOpen * added print outs to debug in MIOpen * removed debug print outs from device op * resolved stdexcept include error * formatting fix * adding includes to new fp8 file to resolve ck::enable_if_t errors * made changes to amd_wave_read_first_lane * updated functionality in type utility file * fixed end of file issue * resolved errors in type utility file, added functionality to array utility file * fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x * formatting fix * replaced standard header usage in amd_ck_fp8 file * added include to random_gen file * removed and replicated standard header usage from data_type and type_convert files for fp8 changes * replicated standard unsigned integer types in random_gen * resolved comments from review: put calls to reinterpret_cast for size_t in header guards * updated/added copyright headers * removed duplicate header * fixed typo in header guard * updated copyright headers --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2026-05-03 05:01:25 +00:00 · 2025-01-31 09:48:39 -08:00
parent 2ab8bf4c12
commit 2e3183af4f
65 changed files with 1119 additions and 385 deletions
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -1,14 +1,17 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck/utility/math.hpp"
 #include "ck/utility/number.hpp"
+#include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef CK_CODE_GEN_RTC
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -978,8 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit
        // Create 3D grid
        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
-
-        return std::make_tuple(N0, M0, k_split);
+        return make_tuple(N0, M0, k_split);
    }

    template <typename TopIdx>
@@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK
            uint32_t dp_for_sk_iters = k_iters_per_tile.get();

            uint32_t best_sk_score =
-                std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+                NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
                tentative_sk_blocks++)
            {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -423,10 +423,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
    }

    template <typename AsLayout, GemmSpecialization GemmSpec>
-    __host__ __device__ static auto
-    MakeAsGridDescriptor_M_K(const std::array<index_t, NumATensor>& MRaws,
-                             const std::array<index_t, NumATensor>& KRaws,
-                             const std::array<index_t, NumATensor>& AsStride)
+    __host__ __device__ static auto MakeAsGridDescriptor_M_K(
+#ifdef CK_CODE_GEN_RTC
+        const ck::Array<index_t, NumATensor>& MRaws,
+        const ck::Array<index_t, NumATensor>& KRaws,
+        const ck::Array<index_t, NumATensor>& AsStride
+#else
+        const std::array<index_t, NumATensor>& MRaws,
+        const std::array<index_t, NumATensor>& KRaws,
+        const std::array<index_t, NumATensor>& AsStride
+#endif
+    )
    {
        return generate_tuple(
            [&](auto i) {
@@ -462,10 +469,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
    }

    template <typename BsLayout, GemmSpecialization GemmSpec>
-    __host__ __device__ static auto
-    MakeBsGridDescriptor_N_K(const std::array<index_t, NumBTensor>& NRaws,
-                             const std::array<index_t, NumBTensor>& KRaws,
-                             const std::array<index_t, NumBTensor>& BsStride)
+    __host__ __device__ static auto MakeBsGridDescriptor_N_K(
+#ifdef CK_CODE_GEN_RTC
+        const ck::Array<index_t, NumBTensor>& NRaws,
+        const ck::Array<index_t, NumBTensor>& KRaws,
+        const ck::Array<index_t, NumBTensor>& BsStride
+#else
+        const std::array<index_t, NumBTensor>& NRaws,
+        const std::array<index_t, NumBTensor>& KRaws,
+        const std::array<index_t, NumBTensor>& BsStride
+#endif
+    )
    {
        return generate_tuple(
            [&](auto i) {
@@ -500,10 +514,17 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
    }

    template <typename DsLayout, GemmSpecialization GemmSpec>
-    __host__ __device__ static auto
-    MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                             const std::array<index_t, NumDTensor>& NRaws,
-                             const std::array<index_t, NumDTensor>& DsStride)
+    __host__ __device__ static auto MakeDsGridDescriptor_M_N(
+#ifdef CK_CODE_GEN_RTC
+        const ck::Array<index_t, NumDTensor>& MRaws,
+        const ck::Array<index_t, NumDTensor>& NRaws,
+        const ck::Array<index_t, NumDTensor>& DsStride
+#else
+        const std::array<index_t, NumDTensor>& MRaws,
+        const std::array<index_t, NumDTensor>& NRaws,
+        const std::array<index_t, NumDTensor>& DsStride
+#endif
+    )
    {
        return generate_tuple(
            [&](auto i) {
@@ -969,9 +990,15 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
                               const index_t M,
                               const index_t N,
                               const index_t K,
+#ifdef CK_CODE_GEN_RTC
+                               const ck::Array<index_t, NumATensor> StrideAs,
+                               const ck::Array<index_t, NumBTensor> StrideBs,
+                               const ck::Array<index_t, NumDTensor> StrideDs,
+#else
                               const std::array<index_t, NumATensor> StrideAs,
                               const std::array<index_t, NumBTensor> StrideBs,
                               const std::array<index_t, NumDTensor> StrideDs,
+#endif
                               const index_t StrideE,
                               const Block2ETileMap& block_2_etile_map)
    {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -473,11 +473,19 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
    }

+#ifdef CK_CODE_GEN_RTC
+    template <typename DsLayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeDsGridDescriptor_M_N(const ck::Array<index_t, NumDTensor>& MRaws,
+                             const ck::Array<index_t, NumDTensor>& NRaws,
+                             const ck::Array<index_t, NumDTensor>& DsStride)
+#else
    template <typename DsLayout, GemmSpecialization GemmSpec>
    __host__ __device__ static auto
    MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
                             const std::array<index_t, NumDTensor>& NRaws,
                             const std::array<index_t, NumDTensor>& DsStride)
+#endif
    {
        return generate_tuple(
            [&](auto i) {
@@ -941,7 +949,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                               const index_t K,
                               const index_t StrideA,
                               const index_t StrideB,
+#ifdef CK_CODE_GEN_RTC
+                               const ck::Array<index_t, NumDTensor> StrideDs,
+#else
                               const std::array<index_t, NumDTensor> StrideDs,
+#endif
                               const index_t StrideE,
                               const Block2ETileMap& block_2_etile_map)
    {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef CK_CODE_GEN_RTC
 #include <iostream>
 #include <ostream>
+#endif

 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
@@ -53,12 +54,15 @@ constexpr auto GridwiseGemmPipeline_Selector()
    }
    else
    {
+#ifndef CK_CODE_GEN_RTC
        std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl;
+#endif
    }
 }

 } // namespace ck

+#ifndef CK_CODE_GEN_RTC
 inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
 {
    switch(p)
@@ -71,3 +75,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
    }
    return os;
 }
+#endif