Rebase the PR #1520 to ROCm repo. (#1574)

* Implement hiprtc for codegen tests

* Introduce gemm_softmax_gemm to codegen.

* Fix codegen build issues.

* Address PR comments.

* Separate ck_host lib and gemm_softmax_gemm into different PR.

* Fix cmake.

* Replace ENV variable with CMake option for toggling hipRTC in codegen
tests.

* Address PR comments.

* fix clang format

* Add missing header in magic_division.hpp

* - Workaround for hipRTC content wrapper
- Move descriptor for gemm_softmax_gemm to different branch

* Fix formatting.

* Revert "Fix formatting."

This reverts commit b5209eaef4.

* formatting fix

* fixed header guard issues

* updated header guards

* updated data_type for new types

* fixed redefinition error

* Add codegen test for batched_gemm_softmax_gemm.

Signed-off-by: Mirza Halilcevic <mirza.halilcevic@amd.com>

* formatting fix

---------

Signed-off-by: Mirza Halilcevic <mirza.halilcevic@amd.com>
Co-authored-by: Dino Musić <dino.music@htecgroup.com>
Co-authored-by: Mirza Halilcevic <mirza.halilcevic@htecgroup.com>
Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
Co-authored-by: arai713 <67439843+arai713@users.noreply.github.com>
Co-authored-by: Astha Rai <astha.rai713@gmail.com>
Co-authored-by: Mirza Halilcevic <mirza.halilcevic@amd.com>
This commit is contained in:
Illia Silin
2025-02-20 18:58:14 -08:00
committed by GitHub
parent c6d29bcd2c
commit 68a08c872e
32 changed files with 880 additions and 517 deletions

View File

@@ -3,11 +3,12 @@
#pragma once
#ifndef CK_CODE_GEN_RTC
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
#include <string>
#include <sstream>
#include <regex>
#include <optional>
#include "ck/stream_config.hpp"
#endif
@@ -15,7 +16,7 @@ namespace ck {
namespace tensor_operation {
namespace device {
#ifndef CK_CODE_GEN_RTC
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
#define GET_OBJECT_NAME_IMLP \
std::optional<std::string> GetObjectName() const override \
{ \
@@ -77,7 +78,7 @@ struct BaseOperator
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
#ifndef CK_CODE_GEN_RTC
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
virtual std::string GetTypeString() const { return ""; }

View File

@@ -2,9 +2,10 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#ifndef __HIPCC_RTC__
#include <iostream>
#include <vector>
#endif
#include "device_base.hpp"
@@ -28,6 +29,7 @@ template <typename ALayout,
bool MaskOutUpperTriangle> // TODO: enum for mask type
struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
{
#ifndef __HIPCC_RTC__
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b0,
@@ -53,6 +55,7 @@ struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
#endif
};
} // namespace device

View File

@@ -2,9 +2,11 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#ifndef __HIPCC_RTC__
#include <array>
#endif
#include "ck/utility/array.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
@@ -34,6 +36,7 @@ struct DeviceGemmMultipleD : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
#ifndef __HIPCC_RTC__
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
@@ -51,6 +54,7 @@ struct DeviceGemmMultipleD : public BaseOperator
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
#endif
};
// GEMM:
@@ -76,6 +80,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
#ifndef __HIPCC_RTC__
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
@@ -94,6 +99,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
#endif
};
// GEMM:

View File

@@ -28,8 +28,7 @@ enum struct GemmSpecialization
NKOPadding,
MNKOPadding,
};
#ifndef CK_CODE_GEN_RTC
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
inline std::string getGemmSpecializationString(const GemmSpecialization& s)
{
switch(s)

View File

@@ -3,8 +3,12 @@
#pragma once
#ifndef __HIPCC_RTC__
#include <iostream>
#include <sstream>
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#endif
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
@@ -15,8 +19,6 @@
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
@@ -429,6 +431,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
matrix_padder.PadN,
MaskOutUpperTriangle>;
#ifndef __HIPCC_RTC__
// Argument
struct Argument : public BaseArgument
{
@@ -603,6 +606,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
#endif
static constexpr bool IsValidCompilationParameter()
{
@@ -610,6 +614,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
return true;
}
#ifndef __HIPCC_RTC__
static constexpr bool
IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
{
@@ -837,6 +842,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
return str.str();
}
#endif
template <class ADesc, class BDesc, class B1Desc, class CDesc>
struct Descriptor

View File

@@ -3,8 +3,12 @@
#pragma once
#ifndef __HIPCC_RTC__
#include <iostream>
#include <sstream>
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#endif
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
@@ -14,8 +18,6 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
@@ -224,9 +226,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
const std::array<index_t, NumDTensor>& NRaws,
const std::array<index_t, NumDTensor>& DsStride)
static auto MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
const Array<index_t, NumDTensor>& NRaws,
const Array<index_t, NumDTensor>& DsStride)
{
return generate_tuple(
[&](auto i) {
@@ -308,6 +310,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
using Block2ETileMap =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
#ifndef __HIPCC_RTC__
// Argument
struct Argument : public BaseArgument
{
@@ -497,6 +500,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
}
};
#endif
static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
{
// check vector load/store
@@ -577,6 +582,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return true;
}
#ifndef __HIPCC_RTC__
static bool IsSupportedArgument(const Argument& arg)
{
if(!ck::is_xdl_supported())
@@ -675,11 +681,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
{
auto str = std::stringstream();
std::map<LoopScheduler, std::string> LoopSchedToString{
{LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
std::map<LoopScheduler, std::string> LoopSchedToString{{LoopScheduler::Default, "Default"},
{ LoopScheduler::Interwave,
"Interwave" }};
std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
{PipelineVersion::v2, "v2"}};
{ PipelineVersion::v2,
"v2" }};
// clang-format off
str << "DeviceGemmMultipleD_Xdl_CShuffle"
@@ -708,6 +716,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return str.str();
}
#endif
template <class ADesc, class BDesc, class DsDesc, class EDesc>
struct Descriptor
@@ -846,7 +855,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
EDataType* __restrict__ p_e_grid)
{
__shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
#ifndef __HIPCC_RTC__
assert(desc.IsValid());
#endif
if(desc.has_main_k_block_loop)
{
GridwiseGemm::template Run<true>(p_a_grid,

View File

@@ -13,6 +13,7 @@ enum struct MaskingSpecialization
MaskOutUpperTriangle
};
#ifndef __HIPCC_RTC__
inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
{
switch(s)
@@ -22,6 +23,7 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
default: return "Unrecognized specialization!";
}
}
#endif
struct MaskDisabledPredicate
{
@@ -53,7 +55,7 @@ struct MaskOutUpperTrianglePredicate
template <typename MaskOutPredicate>
struct C0MatrixMask_impl
{
__host__ __device__ C0MatrixMask_impl(index_t NRaw)
__host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
: NRaw_(NRaw), predicate_(MaskOutPredicate{})
{
}

View File

@@ -436,7 +436,7 @@ struct G_NDHW : public BaseTensorLayout
} // namespace convolution
#ifndef CK_CODE_GEN_RTC
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
template <
typename Layout,
typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>