mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 05:01:25 +00:00
Codegen hipRTC compilation (#1579)
* updating codegen build for MIOpen access: adding .cmake for codegen component * updating CMake * adding in header guards for some headers due to issues with hiprtc compilation in MIOpen * some more header guards * putting env file in header guard * cleaning up some includes * updated types file for hiprtc purposes * fixed types file: bit-wise/memcpy issue * updating multiple utility files to deal with standard header inclusion for hiprtc * added some more header guards in the utility files, replacing some standard header functionality * added some more header guards * fixing some conflicts in utility files, another round of header guards * fixing errors in data type file * resolved conflict errors in a few utility files * added header guards/replicated functionality in device files * resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd * resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp * added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp * fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl * replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector * resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit * added replicated functionality for standard header methods in utility files * replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp * temp fix for namespace error in MIOpen * remove standard header usage in codegen device op * removed standard header usage in elementwise files, resolved namespace errors * formatting fix * changed codegen argument to ON for testing * temporarily removing codegen compiler flag for testing purposes * added codegen flag again, set default to ON * set codegen flag default back to OFF * replaced enable_if_t standard header usage in data_type.hpp * added some debug prints to pinpoint issues in MIOpen * added print outs to debug in MIOpen * removed debug print outs from device op * resolved stdexcept include error * formatting fix * adding includes to new fp8 file to resolve ck::enable_if_t errors * made changes to amd_wave_read_first_lane * updated functionality in type utility file * fixed end of file issue * resovled errors in type utility file, added functionality to array utility file * fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x * formatting fix * replaced standard header usage in amd_ck_fp8 file * added include to random_gen file * removed and replicated standard header usage from data_type and type_convert files for fp8 changes * replicated standard unsigned integer types in random_gen * resolved comments from review: put calls to reinterpret_cast for size_t in header guards * updated/added copyright headers * removed duplicate header * fixed typo in header guard * updated copyright headers --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
#include <string>
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -18,6 +20,7 @@ enum struct ConvolutionForwardSpecialization
|
||||
Filter3x3,
|
||||
};
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
|
||||
{
|
||||
switch(s)
|
||||
@@ -30,6 +33,7 @@ inline std::string getConvForwardSpecializationString(const ConvolutionForwardSp
|
||||
default: return "Unrecognized specialization!";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <optional>
|
||||
|
||||
#include "ck/stream_config.hpp"
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
#define GET_OBJECT_NAME_IMLP \
|
||||
std::optional<std::string> GetObjectName() const override \
|
||||
{ \
|
||||
@@ -41,7 +43,9 @@ namespace device {
|
||||
}
|
||||
|
||||
#define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL
|
||||
#endif
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
struct BaseArgument
|
||||
{
|
||||
BaseArgument() = default;
|
||||
@@ -66,13 +70,14 @@ struct BaseInvoker
|
||||
|
||||
virtual ~BaseInvoker() {}
|
||||
};
|
||||
#endif
|
||||
|
||||
struct BaseOperator
|
||||
{
|
||||
BaseOperator() = default;
|
||||
BaseOperator(const BaseOperator&) = default;
|
||||
BaseOperator& operator=(const BaseOperator&) = default;
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
|
||||
virtual std::string GetTypeString() const { return ""; }
|
||||
|
||||
@@ -100,7 +105,7 @@ struct BaseOperator
|
||||
assert(p_arg);
|
||||
p_arg->p_workspace_ = p_workspace;
|
||||
}
|
||||
|
||||
#endif
|
||||
virtual ~BaseOperator() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
#include <array>
|
||||
#endif
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
|
||||
@@ -13,8 +15,13 @@ namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
#ifdef CK_CODE_GEN_RTC
|
||||
template <typename T>
|
||||
using is_tuple = decltype(ck::declval<T&>().IsTuple());
|
||||
#else
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \brief Grouped Convolution Forward
|
||||
@@ -72,12 +79,18 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator
|
||||
static constexpr index_t NumDTensor = DsDataType::Size();
|
||||
|
||||
static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
|
||||
|
||||
#ifdef CK_CODE_GEN_RTC
|
||||
using APointers = ck::conditional_t<isMultiA, ck::Array<const void*, NumATensor>&, const void*>;
|
||||
using BPointers = ck::conditional_t<isMultiB, ck::Array<const void*, NumBTensor>&, const void*>;
|
||||
#else
|
||||
// If DataType is tuple, user has to pass std::array with pointers.
|
||||
using APointers =
|
||||
std::conditional_t<isMultiA, std::array<const void*, NumATensor>&, const void*>;
|
||||
ck::conditional_t<isMultiA, std::array<const void*, NumATensor>&, const void*>;
|
||||
using BPointers =
|
||||
std::conditional_t<isMultiB, std::array<const void*, NumBTensor>&, const void*>;
|
||||
ck::conditional_t<isMultiB, std::array<const void*, NumBTensor>&, const void*>;
|
||||
#endif
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
|
||||
/**
|
||||
* \brief Make argument pointer for grouped conv fwd.
|
||||
@@ -150,6 +163,7 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator
|
||||
const CDEElementwiseOperation& cde_element_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -29,6 +29,7 @@ enum struct GemmSpecialization
|
||||
MNKOPadding,
|
||||
};
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
inline std::string getGemmSpecializationString(const GemmSpecialization& s)
|
||||
{
|
||||
switch(s)
|
||||
@@ -52,6 +53,7 @@ inline std::string getGemmSpecializationString(const GemmSpecialization& s)
|
||||
default: return "Unrecognized specialization!";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -3,11 +3,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#endif
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
@@ -15,15 +21,12 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/io.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -259,8 +262,13 @@ __global__ void
|
||||
|
||||
} // namespace
|
||||
|
||||
#ifdef CK_CODE_GEN_RTC
|
||||
template <typename T>
|
||||
using is_tuple = decltype(ck::declval<T&>().IsTuple());
|
||||
#else
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
#endif
|
||||
|
||||
//
|
||||
// @brief Device Convolution operation.
|
||||
@@ -429,8 +437,8 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
|
||||
// If we are using multiAB and one of the template datatype parameters is not a tuple, convert
|
||||
// it to it
|
||||
using GemmADataType = std::conditional_t<!isMultiA && isMultiB, Tuple<ADataType>, ADataType>;
|
||||
using GemmBDataType = std::conditional_t<!isMultiB && isMultiA, Tuple<BDataType>, BDataType>;
|
||||
using GemmADataType = ck::conditional_t<!isMultiA && isMultiB, Tuple<ADataType>, ADataType>;
|
||||
using GemmBDataType = ck::conditional_t<!isMultiB && isMultiA, Tuple<BDataType>, BDataType>;
|
||||
|
||||
#define GridwiseGemmTemplateParameters \
|
||||
GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
|
||||
@@ -449,15 +457,13 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
CDEBlockTransferScalarPerVector_NPerBlock, LoopSched
|
||||
// Use appropriate gridwise gemm
|
||||
using GridwiseGemm =
|
||||
std::conditional_t<isMultiA || isMultiB,
|
||||
GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmTemplateParameters>,
|
||||
GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>>;
|
||||
ck::conditional_t<isMultiA || isMultiB,
|
||||
GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmTemplateParameters>,
|
||||
GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>>;
|
||||
|
||||
// If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers.
|
||||
using APointers =
|
||||
std::conditional_t<isMultiA, ck::Array<const void*, NumATensor>&, const void*>;
|
||||
using BPointers =
|
||||
std::conditional_t<isMultiB, ck::Array<const void*, NumBTensor>&, const void*>;
|
||||
using APointers = ck::conditional_t<isMultiA, ck::Array<const void*, NumATensor>&, const void*>;
|
||||
using BPointers = ck::conditional_t<isMultiB, ck::Array<const void*, NumBTensor>&, const void*>;
|
||||
// Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not
|
||||
// in initializer list what is required for single const pointer).
|
||||
using AGridPointer = remove_cvref_t<
|
||||
@@ -812,7 +818,6 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
|
||||
|
||||
// FIXME: layout
|
||||
if constexpr(is_same_v<DLayout, ctc::G_NW_K> || is_same_v<DLayout, ctc::G_NHW_K> ||
|
||||
is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
|
||||
@@ -965,18 +970,18 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
const BElementwiseOperation& b_element_op,
|
||||
const CDEElementwiseOperation& cde_element_op)
|
||||
{
|
||||
std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths_i32;
|
||||
std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_strides_i32;
|
||||
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_i32;
|
||||
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_i32;
|
||||
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_lengths_i32;
|
||||
std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_strides_i32;
|
||||
std::array<index_t, NDimSpatial + 3> e_g_n_k_wos_lengths_i32;
|
||||
std::array<index_t, NDimSpatial + 3> e_g_n_k_wos_strides_i32;
|
||||
std::array<index_t, NDimSpatial> conv_filter_strides_i32;
|
||||
std::array<index_t, NDimSpatial> conv_filter_dilations_i32;
|
||||
std::array<index_t, NDimSpatial> input_left_pads_i32;
|
||||
std::array<index_t, NDimSpatial> input_right_pads_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> a_g_n_c_wis_strides_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_i32;
|
||||
ck::Array<ck::Array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_lengths_i32;
|
||||
ck::Array<ck::Array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_strides_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> e_g_n_k_wos_lengths_i32;
|
||||
ck::Array<index_t, NDimSpatial + 3> e_g_n_k_wos_strides_i32;
|
||||
ck::Array<index_t, NDimSpatial> conv_filter_strides_i32;
|
||||
ck::Array<index_t, NDimSpatial> conv_filter_dilations_i32;
|
||||
ck::Array<index_t, NDimSpatial> input_left_pads_i32;
|
||||
ck::Array<index_t, NDimSpatial> input_right_pads_i32;
|
||||
|
||||
array_convert(a_g_n_c_wis_lengths_i32, a_g_n_c_wis_lengths);
|
||||
array_convert(a_g_n_c_wis_strides_i32, a_g_n_c_wis_strides);
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
|
||||
|
||||
@@ -205,8 +205,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
|
||||
|
||||
const auto b2c_map = DefaultBlock2CTileMap{};
|
||||
index_t gdx, gdy, gdz;
|
||||
std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
|
||||
const auto K0Padded = karg.K0Padded;
|
||||
ck::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
|
||||
const auto K0Padded = karg.K0Padded;
|
||||
|
||||
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0Padded);
|
||||
|
||||
|
||||
@@ -183,8 +183,8 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo
|
||||
|
||||
const auto b2c_map = DefaultBlock2CTileMap{};
|
||||
index_t gdx, gdy, gdz;
|
||||
std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
|
||||
const auto K0Padded = karg.K0Padded;
|
||||
ck::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
|
||||
const auto K0Padded = karg.K0Padded;
|
||||
|
||||
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0Padded);
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
@@ -212,9 +213,13 @@ __global__ void
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#ifdef CK_CODE_GEN_RTC
|
||||
template <typename T>
|
||||
using is_tuple = decltype(ck::declval<T&>().IsTuple());
|
||||
#else
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
#endif
|
||||
|
||||
//
|
||||
// @brief Device Convolution operation.
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
|
||||
|
||||
@@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout
|
||||
|
||||
} // namespace convolution
|
||||
|
||||
#ifndef CK_CODE_GEN_RTC
|
||||
template <
|
||||
typename Layout,
|
||||
typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
|
||||
@@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
|
||||
os << Layout::name;
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace tensor_layout
|
||||
} // namespace ck
|
||||
|
||||
Reference in New Issue
Block a user