mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-06 15:54:31 +00:00
Introduce gemm_softmax_gemm to codegen.
This commit is contained in:
@@ -4,16 +4,12 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/config.h"
|
||||
#include "ck/utility/env.hpp"
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#endif
|
||||
|
||||
// environment variable to enable logging:
|
||||
// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
|
||||
CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
|
||||
#endif
|
||||
|
||||
// to do: add various levels of logging with CK_LOG_LEVEL
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <hip/hip_runtime.h>
|
||||
@@ -96,3 +97,4 @@ inline bool is_gfx12_supported()
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
@@ -160,3 +160,4 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3,15 +3,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck/stream_config.hpp"
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
struct BaseArgument
|
||||
{
|
||||
BaseArgument() = default;
|
||||
@@ -36,6 +38,7 @@ struct BaseInvoker
|
||||
|
||||
virtual ~BaseInvoker() {}
|
||||
};
|
||||
#endif
|
||||
|
||||
struct BaseOperator
|
||||
{
|
||||
@@ -43,6 +46,7 @@ struct BaseOperator
|
||||
BaseOperator(const BaseOperator&) = default;
|
||||
BaseOperator& operator=(const BaseOperator&) = default;
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
|
||||
virtual std::string GetTypeString() const { return ""; }
|
||||
|
||||
@@ -66,7 +70,7 @@ struct BaseOperator
|
||||
assert(p_arg);
|
||||
p_arg->p_workspace_ = p_workspace;
|
||||
}
|
||||
|
||||
#endif
|
||||
virtual ~BaseOperator() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#endif
|
||||
|
||||
#include "device_base.hpp"
|
||||
|
||||
@@ -28,6 +29,7 @@ template <typename ALayout,
|
||||
bool MaskOutUpperTriangle> // TODO: enum for mask type
|
||||
struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
|
||||
{
|
||||
#ifndef __HIPCC_RTC__
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_a,
|
||||
const void* p_b0,
|
||||
@@ -53,6 +55,7 @@ struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
|
||||
CElementwiseOperation c_element_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <array>
|
||||
#endif
|
||||
|
||||
#include "ck/utility/array.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -34,6 +36,7 @@ struct DeviceGemmMultipleD : public BaseOperator
|
||||
{
|
||||
static constexpr index_t NumDTensor = DsDataType::Size();
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_a,
|
||||
const void* p_b,
|
||||
@@ -51,6 +54,7 @@ struct DeviceGemmMultipleD : public BaseOperator
|
||||
CDEElementwiseOperation cde_element_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
// GEMM:
|
||||
@@ -76,6 +80,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
|
||||
{
|
||||
static constexpr index_t NumDTensor = DsDataType::Size();
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_a,
|
||||
const void* p_b,
|
||||
@@ -94,6 +99,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
|
||||
CDEElementwiseOperation cde_element_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
|
||||
@@ -28,7 +28,7 @@ enum struct GemmSpecialization
|
||||
NKOPadding,
|
||||
MNKOPadding,
|
||||
};
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
inline std::string getGemmSpecializationString(const GemmSpecialization& s)
|
||||
{
|
||||
switch(s)
|
||||
@@ -52,6 +52,7 @@ inline std::string getGemmSpecializationString(const GemmSpecialization& s)
|
||||
default: return "Unrecognized specialization!";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -3,8 +3,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#endif
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
@@ -15,8 +19,6 @@
|
||||
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -40,27 +42,27 @@ template <typename GridwiseGemm,
|
||||
bool HasMainKBlockLoop>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
|
||||
const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
const FloatAB* __restrict__ p_b1_grid,
|
||||
FloatC* __restrict__ p_c_grid,
|
||||
const AElementwiseOperation a_element_op,
|
||||
const BElementwiseOperation b_element_op,
|
||||
const AccElementwiseOperation acc_element_op,
|
||||
const B1ElementwiseOperation b1_element_op,
|
||||
const CElementwiseOperation c_element_op,
|
||||
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
|
||||
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
|
||||
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
|
||||
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
c_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const Block2CTileMap block_2_ctile_map,
|
||||
const index_t batch_count,
|
||||
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
|
||||
const C0MatrixMask c0_matrix_mask)
|
||||
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
|
||||
const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
const FloatAB* __restrict__ p_b1_grid,
|
||||
FloatC* __restrict__ p_c_grid,
|
||||
const AElementwiseOperation a_element_op,
|
||||
const BElementwiseOperation b_element_op,
|
||||
const AccElementwiseOperation acc_element_op,
|
||||
const B1ElementwiseOperation b1_element_op,
|
||||
const CElementwiseOperation c_element_op,
|
||||
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
|
||||
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
|
||||
const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1,
|
||||
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
c_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const Block2CTileMap block_2_ctile_map,
|
||||
const index_t batch_count,
|
||||
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
|
||||
const C0MatrixMask c0_matrix_mask)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
|
||||
defined(__gfx94__))
|
||||
@@ -430,6 +432,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
matrix_padder.PadN,
|
||||
MaskOutUpperTriangle>;
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
// Argument
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
@@ -604,6 +607,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
static constexpr bool IsValidCompilationParameter()
|
||||
{
|
||||
@@ -611,6 +615,97 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
return true;
|
||||
}
|
||||
|
||||
static constexpr bool
|
||||
IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
|
||||
{
|
||||
// check vector load/store
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
// check vector load of A
|
||||
if constexpr(is_same_v<ALayout, Row>)
|
||||
{
|
||||
if(KRaw_ % ABlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<ALayout, Col>)
|
||||
{
|
||||
if(MRaw_ % ABlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// check vector load of B
|
||||
if constexpr(is_same_v<BLayout, Row>)
|
||||
{
|
||||
if(NRaw_ % BBlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<BLayout, Col>)
|
||||
{
|
||||
if(KRaw_ % BBlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// check vector load of B1
|
||||
if constexpr(is_same_v<B1Layout, Row>)
|
||||
{
|
||||
if(Gemm1NRaw_ % B1BlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<B1Layout, Col>)
|
||||
{
|
||||
if(NRaw_ % B1BlockTransferSrcScalarPerVector != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// check vector load of C
|
||||
if constexpr(is_same_v<CLayout, Row>)
|
||||
{
|
||||
if(Gemm1NRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<CLayout, Col>)
|
||||
{
|
||||
if(MRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
static bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
if(!ck::is_xdl_supported())
|
||||
@@ -765,8 +860,271 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
|
||||
return str.str();
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class ADesc, class BDesc, class B1Desc, class CDesc>
|
||||
struct Descriptor
|
||||
{
|
||||
template <class AGridDescriptor>
|
||||
static constexpr auto MakeAGridDescriptor_AK0_M_AK1(const AGridDescriptor& a_grid_desc)
|
||||
{
|
||||
const auto a_grid_desc_m_k = DeviceOp::matrix_padder.PadADescriptor_M_K(a_grid_desc);
|
||||
|
||||
const auto M = a_grid_desc_m_k.GetLength(I0);
|
||||
const auto K = a_grid_desc_m_k.GetLength(I1);
|
||||
|
||||
const auto AK0 = K / AK1;
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
a_grid_desc_m_k,
|
||||
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
|
||||
make_pass_through_transform(M)),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
|
||||
}
|
||||
|
||||
template <class BGridDescriptor>
|
||||
static constexpr auto MakeBGridDescriptor_BK0_N_BK1(const BGridDescriptor& b_grid_desc)
|
||||
{
|
||||
const auto b_grid_desc_n_k = DeviceOp::matrix_padder.PadBDescriptor_N_K(b_grid_desc);
|
||||
|
||||
const auto N = b_grid_desc_n_k.GetLength(I0);
|
||||
const auto K = b_grid_desc_n_k.GetLength(I1);
|
||||
|
||||
const auto BK0 = K / BK1;
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
b_grid_desc_n_k,
|
||||
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
|
||||
make_pass_through_transform(N)),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
|
||||
}
|
||||
|
||||
template <class B1GridDescriptor>
|
||||
static constexpr auto MakeB1GridDescriptor_BK0_N_BK1(const B1GridDescriptor& b1_grid_desc)
|
||||
{
|
||||
const auto b1_grid_desc_n_k = DeviceOp::matrix_padder.PadB1Descriptor_N_K(b1_grid_desc);
|
||||
|
||||
const auto N = b1_grid_desc_n_k.GetLength(I0);
|
||||
const auto K = b1_grid_desc_n_k.GetLength(I1);
|
||||
|
||||
const auto B1K0 = K / B1K1;
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
b1_grid_desc_n_k,
|
||||
make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
|
||||
make_pass_through_transform(N)),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
|
||||
}
|
||||
|
||||
template <class CGridDescriptor>
|
||||
static constexpr auto MakeCGridDescriptor_M_N(const CGridDescriptor& c_grid_desc)
|
||||
{
|
||||
return DeviceOp::matrix_padder.PadCDescriptor_M_N(c_grid_desc);
|
||||
}
|
||||
|
||||
using AGridDesc_AK0_M_AK1 =
|
||||
remove_cvref_t<decltype(MakeAGridDescriptor_AK0_M_AK1(ADesc{}))>;
|
||||
using BGridDesc_BK0_N_BK1 =
|
||||
remove_cvref_t<decltype(MakeBGridDescriptor_BK0_N_BK1(BDesc{}))>;
|
||||
using B1GridDesc_BK0_N_BK1 =
|
||||
remove_cvref_t<decltype(MakeB1GridDescriptor_BK0_N_BK1(B1Desc{}))>;
|
||||
using CGridDesc_M_N = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(CDesc{}))>;
|
||||
|
||||
// GridwiseGemm
|
||||
using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
|
||||
ADataType, // TODO: distinguish A/B datatype
|
||||
GemmAccDataType,
|
||||
CShuffleDataType,
|
||||
CDataType,
|
||||
AElementwiseOperation,
|
||||
BElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
B1ElementwiseOperation,
|
||||
CElementwiseOperation,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
AGridDesc_AK0_M_AK1,
|
||||
BGridDesc_BK0_N_BK1,
|
||||
B1GridDesc_BK0_N_BK1,
|
||||
CGridDesc_M_N,
|
||||
NumGemmKPrefetchStage,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
Gemm1NPerBlock,
|
||||
Gemm1KPerBlock,
|
||||
AK1,
|
||||
BK1,
|
||||
B1K1,
|
||||
MPerXDL,
|
||||
NPerXDL,
|
||||
MXdlPerWave,
|
||||
NXdlPerWave,
|
||||
Gemm1NXdlPerWave,
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1,
|
||||
ABlockTransferThreadClusterArrangeOrder,
|
||||
ABlockTransferSrcAccessOrder,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_AK1,
|
||||
true,
|
||||
ABlockLdsExtraM,
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1,
|
||||
BBlockTransferThreadClusterArrangeOrder,
|
||||
BBlockTransferSrcAccessOrder,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_BK1,
|
||||
true,
|
||||
BBlockLdsExtraN,
|
||||
B1BlockTransferThreadClusterLengths_BK0_N_BK1,
|
||||
B1BlockTransferThreadClusterArrangeOrder,
|
||||
B1BlockTransferSrcAccessOrder,
|
||||
B1BlockTransferSrcVectorDim,
|
||||
B1BlockTransferSrcScalarPerVector,
|
||||
B1BlockTransferDstScalarPerVector_BK1,
|
||||
false,
|
||||
B1BlockLdsExtraN,
|
||||
CShuffleMXdlPerWavePerShuffle,
|
||||
CShuffleNXdlPerWavePerShuffle,
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
LoopSched,
|
||||
matrix_padder.PadN,
|
||||
MaskOutUpperTriangle>;
|
||||
|
||||
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1;
|
||||
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1;
|
||||
B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1;
|
||||
CGridDesc_M_N c_grid_desc_m_n;
|
||||
C0MatrixMask c0_matrix_mask;
|
||||
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map;
|
||||
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
c_grid_descriptor_mblock_mperblock_nblock_nperblock;
|
||||
|
||||
// element-wise op
|
||||
AElementwiseOperation a_element_op;
|
||||
BElementwiseOperation b_element_op;
|
||||
B1ElementwiseOperation b1_element_op;
|
||||
CElementwiseOperation c_element_op;
|
||||
|
||||
bool has_main_k_block_loop = true;
|
||||
bool is_valid = false;
|
||||
|
||||
constexpr Descriptor(ADesc a,
|
||||
BDesc b,
|
||||
B1Desc b1,
|
||||
CDesc c,
|
||||
AElementwiseOperation a_element_op_,
|
||||
BElementwiseOperation b_element_op_,
|
||||
B1ElementwiseOperation b1_element_op_,
|
||||
CElementwiseOperation c_element_op_)
|
||||
: a_grid_desc_ak0_m_ak1{MakeAGridDescriptor_AK0_M_AK1(a)},
|
||||
b_grid_desc_bk0_n_bk1{MakeBGridDescriptor_BK0_N_BK1(b)},
|
||||
b1_grid_desc_bk0_n_bk1{MakeB1GridDescriptor_BK0_N_BK1(b1)},
|
||||
c_grid_desc_m_n{MakeCGridDescriptor_M_N(c)},
|
||||
block_2_ctile_map{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n)},
|
||||
c_grid_descriptor_mblock_mperblock_nblock_nperblock{
|
||||
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
c_grid_desc_m_n)},
|
||||
has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
|
||||
a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
|
||||
c0_matrix_mask{c.GetLength(I1)},
|
||||
a_element_op{a_element_op_},
|
||||
b_element_op{b_element_op_},
|
||||
b1_element_op{b1_element_op_},
|
||||
c_element_op{c_element_op_},
|
||||
is_valid{GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1,
|
||||
b_grid_desc_bk0_n_bk1,
|
||||
b1_grid_desc_bk0_n_bk1,
|
||||
c_grid_desc_m_n,
|
||||
block_2_ctile_map) and
|
||||
IsSupported(a_grid_desc_ak0_m_ak1.GetLength(I1),
|
||||
b_grid_desc_bk0_n_bk1.GetLength(I1),
|
||||
a_grid_desc_ak0_m_ak1.GetLength(I0) *
|
||||
a_grid_desc_ak0_m_ak1.GetLength(I2),
|
||||
b1_grid_desc_bk0_n_bk1.GetLength(I1))}
|
||||
{
|
||||
}
|
||||
|
||||
constexpr bool IsValid() const { return is_valid; }
|
||||
};
|
||||
|
||||
template <class ADesc, class BDesc, class B1Desc, class CDesc>
|
||||
static constexpr auto
|
||||
make_descriptor(ADesc a,
|
||||
BDesc b,
|
||||
B1Desc b1,
|
||||
CDesc c,
|
||||
AElementwiseOperation a_element_op = AElementwiseOperation{},
|
||||
BElementwiseOperation b_element_op = BElementwiseOperation{},
|
||||
B1ElementwiseOperation b1_element_op = B1ElementwiseOperation{},
|
||||
CElementwiseOperation c_element_op = CElementwiseOperation{})
|
||||
{
|
||||
return Descriptor<ADesc, BDesc, B1Desc, CDesc>(
|
||||
a, b, b1, c, a_element_op, b_element_op, b1_element_op, c_element_op);
|
||||
}
|
||||
|
||||
template <class Desc>
|
||||
__device__ static void Run(const Desc& desc,
|
||||
const float scale,
|
||||
const ADataType* __restrict__ p_a_grid,
|
||||
const ADataType* __restrict__ p_b_grid,
|
||||
const ADataType* __restrict__ p_b1_grid,
|
||||
CDataType* __restrict__ p_c_grid)
|
||||
{
|
||||
#ifndef __HIPCC_RTC__
|
||||
assert(desc.is_valid);
|
||||
#endif
|
||||
__shared__ char p_shared_block[Desc::GridwiseGemm::GetSharedMemoryNumberOfByte()];
|
||||
AccElementwiseOperation acc_element_op{scale};
|
||||
|
||||
if(desc.has_main_k_block_loop)
|
||||
{
|
||||
Desc::GridwiseGemm::template Run<true>(
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_b1_grid,
|
||||
p_c_grid,
|
||||
p_shared_block,
|
||||
desc.a_element_op,
|
||||
desc.b_element_op,
|
||||
acc_element_op,
|
||||
desc.b1_element_op,
|
||||
desc.c_element_op,
|
||||
desc.a_grid_desc_ak0_m_ak1,
|
||||
desc.b_grid_desc_bk0_n_bk1,
|
||||
desc.b1_grid_desc_bk0_n_bk1,
|
||||
desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
|
||||
desc.block_2_ctile_map,
|
||||
desc.c0_matrix_mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
Desc::GridwiseGemm::template Run<false>(
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_b1_grid,
|
||||
p_c_grid,
|
||||
p_shared_block,
|
||||
desc.a_element_op,
|
||||
desc.b_element_op,
|
||||
acc_element_op,
|
||||
desc.b1_element_op,
|
||||
desc.c_element_op,
|
||||
desc.a_grid_desc_ak0_m_ak1,
|
||||
desc.b_grid_desc_bk0_n_bk1,
|
||||
desc.b1_grid_desc_bk0_n_bk1,
|
||||
desc.c_grid_descriptor_mblock_mperblock_nblock_nperblock,
|
||||
desc.block_2_ctile_map,
|
||||
desc.c0_matrix_mask);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
} // namespace ck
|
||||
@@ -3,8 +3,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#endif
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
@@ -14,8 +18,6 @@
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -35,22 +37,22 @@ template <typename GridwiseGemm,
|
||||
bool HasMainKBlockLoop>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
|
||||
const BDataType* __restrict__ p_b_grid,
|
||||
DsPointer p_ds_grid,
|
||||
EDataType* __restrict__ p_e_grid,
|
||||
const AElementwiseOperation a_element_op,
|
||||
const BElementwiseOperation b_element_op,
|
||||
const CDEElementwiseOperation cde_element_op,
|
||||
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
|
||||
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
|
||||
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
ds_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
e_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const Block2ETileMap block_2_etile_map)
|
||||
kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
|
||||
const BDataType* __restrict__ p_b_grid,
|
||||
DsPointer p_ds_grid,
|
||||
EDataType* __restrict__ p_e_grid,
|
||||
const AElementwiseOperation a_element_op,
|
||||
const BElementwiseOperation b_element_op,
|
||||
const CDEElementwiseOperation cde_element_op,
|
||||
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
|
||||
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
|
||||
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
ds_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
e_grid_desc_mblock_mperblock_nblock_nperblock,
|
||||
const Block2ETileMap block_2_etile_map)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
|
||||
defined(__gfx94__))
|
||||
@@ -225,9 +227,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
|
||||
}
|
||||
|
||||
static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
|
||||
const std::array<index_t, NumDTensor>& NRaws,
|
||||
const std::array<index_t, NumDTensor>& DsStride)
|
||||
static auto MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
|
||||
const Array<index_t, NumDTensor>& NRaws,
|
||||
const Array<index_t, NumDTensor>& DsStride)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
@@ -309,6 +311,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
using Block2ETileMap =
|
||||
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
// Argument
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
@@ -498,6 +501,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
|
||||
{
|
||||
// check vector load/store
|
||||
@@ -578,6 +583,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
static bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
if(!ck::is_xdl_supported())
|
||||
@@ -676,11 +682,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
std::map<LoopScheduler, std::string> LoopSchedToString{
|
||||
{LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
|
||||
std::map<LoopScheduler, std::string> LoopSchedToString{{LoopScheduler::Default, "Default"},
|
||||
{ LoopScheduler::Interwave,
|
||||
"Interwave" }};
|
||||
|
||||
std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
|
||||
{PipelineVersion::v2, "v2"}};
|
||||
{ PipelineVersion::v2,
|
||||
"v2" }};
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceGemmMultipleD_Xdl_CShuffle"
|
||||
@@ -709,6 +717,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
|
||||
return str.str();
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class ADesc, class BDesc, class DsDesc, class EDesc>
|
||||
struct Descriptor
|
||||
@@ -847,7 +856,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
|
||||
EDataType* __restrict__ p_e_grid)
|
||||
{
|
||||
__shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
|
||||
#ifndef __HIPCC_RTC__
|
||||
assert(desc.IsValid());
|
||||
#endif
|
||||
if(desc.has_main_k_block_loop)
|
||||
{
|
||||
GridwiseGemm::template Run<true>(p_a_grid,
|
||||
|
||||
@@ -13,6 +13,7 @@ enum struct MaskingSpecialization
|
||||
MaskOutUpperTriangle
|
||||
};
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
|
||||
{
|
||||
switch(s)
|
||||
@@ -22,6 +23,7 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
|
||||
default: return "Unrecognized specialization!";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
struct MaskDisabledPredicate
|
||||
{
|
||||
@@ -53,7 +55,7 @@ struct MaskOutUpperTrianglePredicate
|
||||
template <typename MaskOutPredicate>
|
||||
struct C0MatrixMask_impl
|
||||
{
|
||||
__host__ __device__ C0MatrixMask_impl(index_t NRaw)
|
||||
__host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
|
||||
: NRaw_(NRaw), predicate_(MaskOutPredicate{})
|
||||
{
|
||||
}
|
||||
|
||||
@@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout
|
||||
|
||||
} // namespace convolution
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <
|
||||
typename Layout,
|
||||
typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
|
||||
@@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
|
||||
os << Layout::name;
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace tensor_layout
|
||||
} // namespace ck
|
||||
|
||||
@@ -340,8 +340,8 @@ struct Bilinear
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<std::int8_t, std::int32_t, std::int8_t>(
|
||||
std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const
|
||||
__host__ __device__ constexpr void operator()<int8_t, int32_t, int8_t>(
|
||||
int8_t& y, const int32_t& x0, const int8_t& x1) const
|
||||
{
|
||||
y = type_convert<int8_t>(alpha_ * type_convert<float>(x0) +
|
||||
beta_ * type_convert<float>(x1));
|
||||
|
||||
@@ -466,7 +466,7 @@ struct FastGelu
|
||||
|
||||
template <typename Y, typename X>
|
||||
__device__ void operator()(Y& y, const X& x) const;
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <>
|
||||
__host__ void operator()<float, float>(float& y, const float& x) const
|
||||
{
|
||||
@@ -477,7 +477,7 @@ struct FastGelu
|
||||
const float emu = exp(u);
|
||||
y = x / (1.f + emu);
|
||||
}
|
||||
|
||||
#endif
|
||||
// device code, use lower precision "__ocml_exp_f32" and "rcp"
|
||||
template <>
|
||||
__device__ void operator()<float, float>(float& y, const float& x) const
|
||||
|
||||
@@ -7,8 +7,10 @@
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/tensor_description/tensor_adaptor.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <limits>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -979,7 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit
|
||||
const auto M0 = math::integer_divide_ceil(M, MPerBlock);
|
||||
const auto N0 = math::integer_divide_ceil(N, NPerBlock);
|
||||
|
||||
return std::make_tuple(N0, M0, k_split);
|
||||
return ck::make_tuple(N0, M0, k_split);
|
||||
}
|
||||
|
||||
template <typename TopIdx>
|
||||
@@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK
|
||||
uint32_t dp_for_sk_iters = k_iters_per_tile.get();
|
||||
|
||||
uint32_t best_sk_score =
|
||||
std::numeric_limits<int>::max(); // we need to find the smallest sk iters
|
||||
ck::NumericLimits<int>::Max(); // we need to find the smallest sk iters
|
||||
for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
|
||||
tentative_sk_blocks++)
|
||||
{
|
||||
|
||||
@@ -475,9 +475,9 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
|
||||
|
||||
template <typename DsLayout, GemmSpecialization GemmSpec>
|
||||
__host__ __device__ static auto
|
||||
MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
|
||||
const std::array<index_t, NumDTensor>& NRaws,
|
||||
const std::array<index_t, NumDTensor>& DsStride)
|
||||
MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
|
||||
const Array<index_t, NumDTensor>& NRaws,
|
||||
const Array<index_t, NumDTensor>& DsStride)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
@@ -941,7 +941,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
|
||||
const index_t K,
|
||||
const index_t StrideA,
|
||||
const index_t StrideB,
|
||||
const std::array<index_t, NumDTensor> StrideDs,
|
||||
const Array<index_t, NumDTensor> StrideDs,
|
||||
const index_t StrideE,
|
||||
const Block2ETileMap& block_2_etile_map)
|
||||
{
|
||||
|
||||
@@ -3,8 +3,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
|
||||
@@ -53,12 +55,15 @@ constexpr auto GridwiseGemmPipeline_Selector()
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifndef __HIPCC_RTC__
|
||||
std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
|
||||
{
|
||||
switch(p)
|
||||
@@ -71,3 +76,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1005,6 +1005,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
|
||||
index_t offset,
|
||||
index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <typename T, index_t NumElemsPerThread>
|
||||
__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
const index_t global_offset,
|
||||
@@ -1042,5 +1043,6 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -7,10 +7,12 @@
|
||||
#include "ck/utility/functional2.hpp"
|
||||
#include "ck/utility/math.hpp"
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
namespace detail {
|
||||
@@ -37,7 +39,7 @@ struct get_carrier<3>
|
||||
{
|
||||
using value_type = uint32_t;
|
||||
|
||||
std::array<std::byte, 3> bytes;
|
||||
Array<ck::byte, 3> bytes;
|
||||
static_assert(sizeof(bytes) <= sizeof(value_type));
|
||||
|
||||
// replacement of host std::copy_n()
|
||||
@@ -61,22 +63,22 @@ struct get_carrier<3>
|
||||
// method to trigger template substitution failure
|
||||
__device__ carrier(const carrier& other) noexcept
|
||||
{
|
||||
copy_n(other.bytes.begin(), bytes.size(), bytes.begin());
|
||||
copy_n(other.bytes.begin(), bytes.Size(), bytes.begin());
|
||||
}
|
||||
|
||||
public:
|
||||
__device__ carrier& operator=(value_type value) noexcept
|
||||
{
|
||||
copy_n(reinterpret_cast<const std::byte*>(&value), bytes.size(), bytes.begin());
|
||||
copy_n(reinterpret_cast<const ck::byte*>(&value), bytes.Size(), bytes.begin());
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
__device__ operator value_type() const noexcept
|
||||
{
|
||||
std::byte result[sizeof(value_type)];
|
||||
ck::byte result[sizeof(value_type)];
|
||||
|
||||
copy_n(bytes.begin(), bytes.size(), result);
|
||||
copy_n(bytes.begin(), bytes.Size(), result);
|
||||
|
||||
return *reinterpret_cast<const value_type*>(result);
|
||||
}
|
||||
@@ -109,8 +111,8 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)
|
||||
{
|
||||
constexpr unsigned object_size = sizeof(int64_t);
|
||||
constexpr unsigned second_part_offset = object_size / 2;
|
||||
auto* const from_obj = reinterpret_cast<const std::byte*>(&value);
|
||||
alignas(int64_t) std::byte to_obj[object_size];
|
||||
auto* const from_obj = reinterpret_cast<const ck::byte*>(&value);
|
||||
alignas(int64_t) ck::byte to_obj[object_size];
|
||||
|
||||
using Sgpr = uint32_t;
|
||||
|
||||
@@ -124,15 +126,15 @@ __device__ inline int64_t amd_wave_read_first_lane(int64_t value)
|
||||
|
||||
template <
|
||||
typename Object,
|
||||
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
|
||||
typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
|
||||
__device__ auto amd_wave_read_first_lane(const Object& obj)
|
||||
{
|
||||
using Size = unsigned;
|
||||
constexpr Size SgprSize = 4;
|
||||
constexpr Size ObjectSize = sizeof(Object);
|
||||
|
||||
auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);
|
||||
alignas(Object) std::byte to_obj[ObjectSize];
|
||||
auto* const from_obj = reinterpret_cast<const ck::byte*>(&obj);
|
||||
alignas(Object) ck::byte to_obj[ObjectSize];
|
||||
|
||||
constexpr Size RemainedSize = ObjectSize % SgprSize;
|
||||
constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
|
||||
|
||||
@@ -38,6 +38,8 @@ struct Array
|
||||
}
|
||||
__host__ __device__ constexpr const TData* begin() const { return &mData[0]; }
|
||||
__host__ __device__ constexpr const TData* end() const { return &mData[NSize]; }
|
||||
__host__ __device__ constexpr TData* begin() { return &mData[0]; }
|
||||
__host__ __device__ constexpr TData* end() { return &mData[NSize]; }
|
||||
};
|
||||
|
||||
// empty Array
|
||||
@@ -54,7 +56,7 @@ template <typename X, typename... Xs>
|
||||
__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs)
|
||||
{
|
||||
using data_type = remove_cvref_t<X>;
|
||||
return Array<data_type, sizeof...(Xs) + 1>{std::forward<X>(x), std::forward<Xs>(xs)...};
|
||||
return Array<data_type, sizeof...(Xs) + 1>{ck::forward<X>(x), ck::forward<Xs>(xs)...};
|
||||
}
|
||||
|
||||
// make empty array
|
||||
|
||||
@@ -326,14 +326,14 @@ template <typename T, index_t NX, index_t NY>
|
||||
__host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
|
||||
{
|
||||
return unpack2(
|
||||
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
|
||||
[&](auto&&... zs) { return make_array(ck::forward<decltype(zs)>(zs)...); }, ax, ay);
|
||||
}
|
||||
|
||||
template <typename... X, typename... Y>
|
||||
__host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
|
||||
{
|
||||
return unpack2(
|
||||
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
|
||||
[&](auto&&... zs) { return make_tuple(ck::forward<decltype(zs)>(zs)...); }, tx, ty);
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
|
||||
@@ -5,8 +5,25 @@
|
||||
|
||||
#include "ck/utility/statically_indexed_array.hpp"
|
||||
|
||||
#ifdef __HIPCC_RTC__
|
||||
/// Definitions from <cstdint>, <cmath> conflict with
|
||||
/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.
|
||||
|
||||
using int8_t = signed char;
|
||||
using uint8_t = unsigned char;
|
||||
using int16_t = signed short;
|
||||
using uint16_t = unsigned short;
|
||||
using float_t = float;
|
||||
#endif // __HIPCC_RTC__
|
||||
|
||||
namespace ck {
|
||||
|
||||
#ifdef __HIPCC_RTC__
|
||||
using byte = unsigned char;
|
||||
#else
|
||||
using std::byte;
|
||||
#endif
|
||||
|
||||
using bhalf_t = ushort;
|
||||
using half_t = _Float16;
|
||||
using int4_t = _BitInt(4);
|
||||
@@ -1060,6 +1077,146 @@ using uint8x16_t = typename vector_type<uint8_t, 16>::type;
|
||||
using uint8x32_t = typename vector_type<uint8_t, 32>::type;
|
||||
using uint8x64_t = typename vector_type<uint8_t, 64>::type;
|
||||
|
||||
#ifdef __HIPCC_RTC__
|
||||
template <typename T>
|
||||
struct NumericLimits;
|
||||
|
||||
template <>
|
||||
struct NumericLimits<int32_t>
|
||||
{
|
||||
__host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
|
||||
|
||||
__host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
|
||||
|
||||
__host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
|
||||
|
||||
__host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<int16_t>
|
||||
{
|
||||
__host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
|
||||
|
||||
__host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
|
||||
|
||||
__host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
|
||||
|
||||
__host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<int8_t>
|
||||
{
|
||||
__host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
|
||||
|
||||
__host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
|
||||
|
||||
__host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
|
||||
|
||||
__host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<uint32_t>
|
||||
{
|
||||
__host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
|
||||
|
||||
__host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<uint16_t>
|
||||
{
|
||||
__host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
|
||||
|
||||
__host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
|
||||
|
||||
__host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<float>
|
||||
{
|
||||
static constexpr unsigned int binary_min = 0x00800000;
|
||||
static constexpr unsigned int binary_max = 0x7F7FFFFF;
|
||||
static constexpr unsigned int binary_lowest = 0xFF7FFFFF;
|
||||
static constexpr unsigned int binary_qnan = 0xFFC00001;
|
||||
static constexpr unsigned int binary_inf = 0x7F8000000;
|
||||
|
||||
__host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
|
||||
|
||||
__host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
|
||||
|
||||
__host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
|
||||
|
||||
__host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
|
||||
|
||||
__host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct NumericLimits<half_t>
|
||||
{
|
||||
static constexpr unsigned short binary_min = 0x0400;
|
||||
static constexpr unsigned short binary_max = 0x7BFF;
|
||||
static constexpr unsigned short binary_lowest = 0xFBFF;
|
||||
static constexpr unsigned short binary_qnan = 0x7FFF;
|
||||
|
||||
__host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
|
||||
|
||||
__host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
|
||||
|
||||
__host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
|
||||
|
||||
__host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
|
||||
};
|
||||
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// Range constants for the experimental 4-bit signed integer type.
template <>
struct NumericLimits<int4_t>
{
    __host__ __device__ static constexpr int4_t Lowest() { return static_cast<int4_t>(-8); }

    __host__ __device__ static constexpr int4_t Max() { return static_cast<int4_t>(7); }

    __host__ __device__ static constexpr int4_t Min() { return static_cast<int4_t>(-8); }
};
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
|
||||
template <>
|
||||
struct NumericLimits<f8_t>
|
||||
{
|
||||
static constexpr uint8_t binary_min = 0x08; // 0b00001000
|
||||
static constexpr uint8_t binary_max = 0x77; // 0b01110111
|
||||
static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
|
||||
static constexpr uint8_t binary_qnan = 0x80; // 0b10000000
|
||||
|
||||
__host__ __device__ static constexpr f8_t Min() { return bit_cast<f8_t>(binary_min); }
|
||||
|
||||
__host__ __device__ static constexpr f8_t Max() { return bit_cast<f8_t>(binary_max); }
|
||||
|
||||
__host__ __device__ static constexpr f8_t Lowest() { return bit_cast<f8_t>(binary_lowest); }
|
||||
|
||||
__host__ __device__ static constexpr f8_t QuietNaN() { return bit_cast<f8_t>(binary_qnan); }
|
||||
};
|
||||
#else
|
||||
template <typename T>
|
||||
struct NumericLimits
|
||||
{
|
||||
@@ -1151,6 +1308,7 @@ struct NumericLimits<bf8_t>
|
||||
|
||||
__host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); }
|
||||
};
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
struct NumericUtils
|
||||
|
||||
@@ -4,11 +4,26 @@
|
||||
#pragma once
|
||||
|
||||
namespace ck {
|
||||
#ifdef __HIPCC_RTC__
|
||||
// Minimal SFINAE helper for builds without <type_traits>.
// Primary template: deliberately exposes no `type` member, so referring to
// `enable_if<false, ...>::type` is a substitution failure.
template <bool Condition, typename Type = void>
struct enable_if
{
};

// Chosen when the condition holds: publishes the requested type.
template <typename Type>
struct enable_if<true, Type>
{
    typedef Type type;
};

// Shorthand for `typename enable_if<Condition, Type>::type`.
template <bool Condition, typename Type = void>
using enable_if_t = typename enable_if<Condition, Type>::type;
|
||||
|
||||
#else
|
||||
template <bool B, typename T = void>
|
||||
using enable_if = std::enable_if<B, T>;
|
||||
|
||||
template <bool B, typename T = void>
|
||||
using enable_if_t = typename std::enable_if<B, T>::type;
|
||||
|
||||
#endif
|
||||
} // namespace ck
|
||||
|
||||
@@ -183,3 +183,7 @@ void UpdateEnvVar(EnvVar, const std::string_view& val)
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
// environment variable to enable logging:
|
||||
// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
|
||||
CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
|
||||
|
||||
@@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y)
|
||||
{
|
||||
if constexpr(predicate)
|
||||
{
|
||||
return std::forward<X>(x);
|
||||
return ck::forward<X>(x);
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::forward<Y>(y);
|
||||
return ck::forward<Y>(y);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ struct unpack_impl<Sequence<Is...>>
|
||||
template <typename F, typename X>
|
||||
__host__ __device__ constexpr auto operator()(F&& f, X&& x) const
|
||||
{
|
||||
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...);
|
||||
return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -35,8 +35,8 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
|
||||
template <typename F, typename X, typename Y>
|
||||
__host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
|
||||
{
|
||||
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...,
|
||||
std::forward<Y>(y).At(Number<Js>{})...);
|
||||
return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...,
|
||||
ck::forward<Y>(y).At(Number<Js>{})...);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
|
||||
{
|
||||
using X_ = remove_reference_t<X>;
|
||||
return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
|
||||
std::forward<F>(f), std::forward<X>(x));
|
||||
ck::forward<F>(f), ck::forward<X>(x));
|
||||
}
|
||||
|
||||
// TODO: properly implement unpack that takes any number of containers
|
||||
@@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
|
||||
using Y_ = remove_reference_t<Y>;
|
||||
return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
|
||||
typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
|
||||
std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y));
|
||||
ck::forward<F>(f), ck::forward<X>(x), ck::forward<Y>(y));
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -9,14 +9,14 @@ namespace detail {
|
||||
template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
|
||||
struct detector
|
||||
{
|
||||
using value_t = std::false_type;
|
||||
using value_t = ck::false_type;
|
||||
using type = Default;
|
||||
};
|
||||
|
||||
template <class Default, template <class...> class Op, class... Args>
|
||||
struct detector<Default, std::void_t<Op<Args...>>, Op, Args...>
|
||||
struct detector<Default, ck::void_t<Op<Args...>>, Op, Args...>
|
||||
{
|
||||
using value_t = std::true_type;
|
||||
using value_t = ck::true_type;
|
||||
using type = Op<Args...>;
|
||||
};
|
||||
} // namespace detail
|
||||
@@ -32,12 +32,12 @@ template <template <class...> class Op, class... Args>
|
||||
using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;
|
||||
|
||||
template <typename T>
|
||||
using is_pack2_invocable_t = decltype(std::declval<T&>().is_pack2_invocable);
|
||||
using is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable);
|
||||
|
||||
template <typename T>
|
||||
using is_pack4_invocable_t = decltype(std::declval<T&>().is_pack4_invocable);
|
||||
using is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable);
|
||||
|
||||
template <typename T>
|
||||
using is_pack8_invocable_t = decltype(std::declval<T&>().is_pack8_invocable);
|
||||
using is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable);
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#include <ostream>
|
||||
|
||||
#pragma once
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_adaptor.hpp"
|
||||
@@ -26,6 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
|
||||
{
|
||||
switch(s)
|
||||
@@ -36,3 +39,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -30,7 +30,7 @@ struct MagicDivision
|
||||
// WARNING: magic division is only applicable for division inside this range.
|
||||
// You should use the return value of CalculateMagicNumbers, if division is not inside this
|
||||
// range. The "else" logic below is to quiet down run-time error.
|
||||
if(divisor >= 1 && divisor <= INT32_MAX)
|
||||
if(divisor >= 1 && divisor <= ck::NumericLimits<int32_t>::Max())
|
||||
{
|
||||
uint32_t shift = 0;
|
||||
for(shift = 0; shift < 32; ++shift)
|
||||
|
||||
@@ -18,6 +18,7 @@ namespace math {
|
||||
extern "C" __device__ float __ocml_native_recip_f32(float);
|
||||
#endif
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
// math functions for the host, some are implemented by calling C++ std functions
|
||||
|
||||
static inline __host__ float abs(float x) { return std::abs(x); };
|
||||
@@ -457,6 +458,7 @@ inline __host__ double expm1<double>(double x)
|
||||
{
|
||||
return std::expm1(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
|
||||
|
||||
@@ -920,5 +922,23 @@ inline __device__ double expm1<double>(double x)
|
||||
return expm1(x);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline __device__ T cos(T x)
|
||||
{
|
||||
return ck::type_convert<T>(cosf(ck::type_convert<float>(x)));
|
||||
};
|
||||
|
||||
template <>
|
||||
inline __device__ float cos<float>(float x)
|
||||
{
|
||||
return cosf(x);
|
||||
};
|
||||
|
||||
template <>
|
||||
inline __device__ double cos<double>(double x)
|
||||
{
|
||||
return cos(x);
|
||||
};
|
||||
|
||||
} // namespace math
|
||||
} // namespace ck
|
||||
|
||||
@@ -7,7 +7,7 @@ namespace ck {
|
||||
|
||||
// Pseudo random number generator
|
||||
// version for fp32
|
||||
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<float, T>{}, bool> = false>
|
||||
template <typename T, uint32_t seed_t, ck::enable_if_t<ck::is_same<float, T>{}, bool> = false>
|
||||
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
|
||||
{
|
||||
uint32_t x = *(reinterpret_cast<uint32_t*>(&val));
|
||||
@@ -23,7 +23,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
|
||||
}
|
||||
|
||||
// version for fp16
|
||||
template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
|
||||
template <typename T, uint32_t seed_t, ck::enable_if_t<ck::is_same<half_t, T>{}, bool> = false>
|
||||
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
|
||||
{
|
||||
uint16_t x = *(reinterpret_cast<uint16_t*>(&val));
|
||||
@@ -40,12 +40,18 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
|
||||
// return 0 if data is not fp16 or fp32
|
||||
template <typename T,
|
||||
uint32_t seed_t,
|
||||
std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
|
||||
ck::enable_if_t<!(ck::is_same<float, T>{} || ck::is_same<half_t, T>{}), bool> = false>
|
||||
__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
|
||||
{
|
||||
#ifdef __HIPCC_RTC__
|
||||
static_cast<void>(id);
|
||||
static_cast<void>(val);
|
||||
static_cast<void>(seed);
|
||||
#else
|
||||
std::ignore = id;
|
||||
std::ignore = val;
|
||||
std::ignore = seed;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
#include "ck/utility/integral_constant.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
@@ -900,6 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <ck::index_t... Is>
|
||||
std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
|
||||
{
|
||||
@@ -910,3 +913,4 @@ std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
|
||||
os << S::At(S::Size() - ck::Number<1>{}).value << "}";
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -32,7 +32,7 @@ struct TupleElementKeyData
|
||||
template <typename T,
|
||||
typename enable_if<!is_same<remove_cvref_t<T>, TupleElementKeyData>::value,
|
||||
bool>::type = false>
|
||||
__host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(std::forward<T>(v))
|
||||
__host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(ck::forward<T>(v))
|
||||
{
|
||||
}
|
||||
|
||||
@@ -67,7 +67,7 @@ get_tuple_element_data_reference(TupleElementKeyData<Key, Data>&& x)
|
||||
template <typename Key, typename Data>
|
||||
__host__ __device__ constexpr Data get_tuple_element_data(const TupleElementKeyData<Key, Data>& x)
|
||||
{
|
||||
return std::forward(x.mData);
|
||||
return ck::forward(x.mData);
|
||||
}
|
||||
|
||||
template <typename Indices, typename... Xs>
|
||||
@@ -83,13 +83,13 @@ struct TupleImpl<Sequence<Is...>, Xs...> : TupleElementKeyData<TupleElementKey<I
|
||||
!is_same<remove_cvref_t<Y>, TupleImpl>::value,
|
||||
bool>::type = false>
|
||||
__host__ __device__ constexpr TupleImpl(Y&& y)
|
||||
: TupleElementKeyData<TupleElementKey<Is>, Xs>(std::forward<Y>(y))...
|
||||
: TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Y>(y))...
|
||||
{
|
||||
}
|
||||
|
||||
template <typename... Ys, typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
|
||||
__host__ __device__ constexpr TupleImpl(Ys&&... ys)
|
||||
: TupleElementKeyData<TupleElementKey<Is>, Xs>(std::forward<Ys>(ys))...
|
||||
: TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Ys>(ys))...
|
||||
{
|
||||
static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys),
|
||||
"wrong! inconsistent size");
|
||||
@@ -123,14 +123,14 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
|
||||
template <typename Y,
|
||||
typename enable_if<sizeof...(Xs) == 1 && !is_same<remove_cvref_t<Y>, Tuple>::value,
|
||||
bool>::type = false>
|
||||
__host__ __device__ constexpr Tuple(Y&& y) : base(std::forward<Y>(y))
|
||||
__host__ __device__ constexpr Tuple(Y&& y) : base(ck::forward<Y>(y))
|
||||
{
|
||||
}
|
||||
|
||||
template <typename... Ys,
|
||||
typename enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 2, bool>::type =
|
||||
false>
|
||||
__host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
|
||||
__host__ __device__ constexpr Tuple(Ys&&... ys) : base(ck::forward<Ys>(ys)...)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -210,7 +210,7 @@ using tuple_element_t = typename tuple_element<I, TTuple>::type;
|
||||
template <typename... Xs>
|
||||
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
|
||||
{
|
||||
return Tuple<remove_cvref_t<Xs>...>(std::forward<Xs>(xs)...);
|
||||
return Tuple<remove_cvref_t<Xs>...>(ck::forward<Xs>(xs)...);
|
||||
}
|
||||
|
||||
// https://en.cppreference.com/w/cpp/utility/tuple/tie
|
||||
|
||||
@@ -29,7 +29,7 @@ __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>&
|
||||
const Tuple<Y&...>& ty)
|
||||
{
|
||||
return unpack2(
|
||||
[&](auto&&... zs) { return Tuple<decltype(zs)...>{std::forward<decltype(zs)>(zs)...}; },
|
||||
[&](auto&&... zs) { return Tuple<decltype(zs)...>{ck::forward<decltype(zs)>(zs)...}; },
|
||||
tx,
|
||||
ty);
|
||||
}
|
||||
@@ -38,7 +38,7 @@ template <typename... X, typename... Y>
|
||||
__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuple<Y...>& ty)
|
||||
{
|
||||
return unpack2(
|
||||
[&](auto... zs) { return Tuple<decltype(zs)...>{std::forward<decltype(zs)>(zs)...}; },
|
||||
[&](auto... zs) { return Tuple<decltype(zs)...>{ck::forward<decltype(zs)>(zs)...}; },
|
||||
tx,
|
||||
ty);
|
||||
}
|
||||
@@ -157,6 +157,7 @@ __host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
|
||||
@@ -165,6 +166,7 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
|
||||
{
|
||||
return (is_detected<is_tuple, Ts>::value || ...);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <index_t depth = 0, typename T>
|
||||
__host__ __device__ constexpr auto TupleDepth(const T&)
|
||||
|
||||
@@ -8,6 +8,158 @@
|
||||
#include "ck/utility/enable_if.hpp"
|
||||
|
||||
namespace ck {
|
||||
#ifdef __HIPCC_RTC__
|
||||
template <bool B>
|
||||
using bool_constant = integral_constant<bool, B>;
|
||||
|
||||
using true_type = bool_constant<true>;
|
||||
using false_type = bool_constant<false>;
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define CK_BUILTIN_TYPE_TRAIT1(name) \
|
||||
template <class T> \
|
||||
struct name : bool_constant<__##name(T)> \
|
||||
{ \
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define CK_BUILTIN_TYPE_TRAIT2(name) \
|
||||
template <class T, class U> \
|
||||
struct name : bool_constant<__##name(T, U)> \
|
||||
{ \
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define CK_BUILTIN_TYPE_TRAITN(name) \
|
||||
template <class... Ts> \
|
||||
struct name : bool_constant<__##name(Ts...)> \
|
||||
{ \
|
||||
}
|
||||
|
||||
CK_BUILTIN_TYPE_TRAIT1(is_class);
|
||||
CK_BUILTIN_TYPE_TRAIT1(is_pointer);
|
||||
CK_BUILTIN_TYPE_TRAIT1(is_reference);
|
||||
CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
|
||||
CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
|
||||
CK_BUILTIN_TYPE_TRAIT2(is_base_of);
|
||||
|
||||
// Strips top-level `const` and/or `volatile` from T (stand-in for
// std::remove_cv when <type_traits> is unavailable).
template <class T>
struct remove_cv
{
    using type = T;
};

template <class T>
struct remove_cv<const T>
{
    using type = T;
};

template <class T>
struct remove_cv<volatile T>
{
    using type = T;
};

// Required for `const volatile T`: without this specialization both the
// `const T` and `volatile T` partial specializations match equally well and
// instantiation fails as ambiguous.
template <class T>
struct remove_cv<const volatile T>
{
    using type = T;
};
|
||||
|
||||
// Maps T, T& and T&& all to the referred-to type T (stand-in for
// std::remove_reference when <type_traits> is unavailable).
template <typename Referred>
struct remove_reference
{
    using type = Referred;
};

// rvalue reference
template <typename Referred>
struct remove_reference<Referred&&>
{
    using type = Referred;
};

// lvalue reference
template <typename Referred>
struct remove_reference<Referred&>
{
    using type = Referred;
};
|
||||
|
||||
// Yields the pointee type for any (possibly cv-qualified) pointer type and
// leaves non-pointer types untouched (stand-in for std::remove_pointer).
template <typename Pointee>
struct remove_pointer
{
    using type = Pointee;
};

// cv-qualified pointers, most qualified first
template <typename Pointee>
struct remove_pointer<Pointee* const volatile>
{
    using type = Pointee;
};

template <typename Pointee>
struct remove_pointer<Pointee* volatile>
{
    using type = Pointee;
};

template <typename Pointee>
struct remove_pointer<Pointee* const>
{
    using type = Pointee;
};

// plain pointer
template <typename Pointee>
struct remove_pointer<Pointee*>
{
    using type = Pointee;
};
|
||||
|
||||
template <typename T>
|
||||
constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
|
||||
{
|
||||
return static_cast<T&&>(t_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
|
||||
{
|
||||
return static_cast<T&&>(t_);
|
||||
}
|
||||
|
||||
// TODO
|
||||
// True exactly when T carries a top-level `const` qualifier.
template <typename T>
struct is_const : false_type
{
};

template <typename T>
struct is_const<const T> : true_type
{
};

// Variable-template shorthand for is_const<T>::value.
template <typename T>
inline constexpr bool is_const_v = is_const<T>::value;
|
||||
|
||||
template< class T >
|
||||
inline constexpr bool is_reference_v = is_reference<T>::value;
|
||||
|
||||
// Strips a top-level `const` from T; any other type is passed through unchanged.
template <typename T>
struct remove_const
{
    using type = T;
};

template <typename T>
struct remove_const<const T>
{
    using type = T;
};

// Alias shorthand for remove_const<T>::type.
template <typename T>
using remove_const_t = typename remove_const<T>::type;
|
||||
|
||||
template< class T >
|
||||
inline constexpr bool is_class_v = is_class<T>::value;
|
||||
|
||||
template< class T >
|
||||
inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
|
||||
|
||||
template< class... >
|
||||
using void_t = void;
|
||||
|
||||
using __hip::declval;
|
||||
#else
|
||||
#include <utility>
|
||||
#include <type_traits>
|
||||
using std::forward;
|
||||
using std::is_base_of;
|
||||
using std::is_class;
|
||||
using std::is_pointer;
|
||||
using std::is_reference;
|
||||
using std::is_trivially_copyable;
|
||||
using std::is_unsigned;
|
||||
using std::remove_cv;
|
||||
using std::remove_pointer;
|
||||
using std::remove_reference;
|
||||
using std::is_const_v;
|
||||
using std::is_reference_v;
|
||||
using std::remove_const_t;
|
||||
using std::is_class_v;
|
||||
using std::is_trivially_copyable_v;
|
||||
using std::void_t;
|
||||
using std::false_type;
|
||||
using std::true_type;
|
||||
using std::declval;
|
||||
#endif
|
||||
|
||||
template <typename X, typename Y>
|
||||
struct is_same : public integral_constant<bool, false>
|
||||
@@ -23,19 +175,19 @@ template <typename X, typename Y>
|
||||
inline constexpr bool is_same_v = is_same<X, Y>::value;
|
||||
|
||||
template <typename T>
|
||||
using remove_reference_t = typename std::remove_reference<T>::type;
|
||||
using remove_reference_t = typename remove_reference<T>::type;
|
||||
|
||||
template <typename T>
|
||||
using remove_cv_t = typename std::remove_cv<T>::type;
|
||||
using remove_cv_t = typename remove_cv<T>::type;
|
||||
|
||||
template <typename T>
|
||||
using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;
|
||||
using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;
|
||||
|
||||
template <typename T>
|
||||
using remove_pointer_t = typename std::remove_pointer<T>::type;
|
||||
using remove_pointer_t = typename remove_pointer<T>::type;
|
||||
|
||||
template <typename T>
|
||||
inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
|
||||
inline constexpr bool is_pointer_v = is_pointer<T>::value;
|
||||
|
||||
template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
|
||||
__host__ __device__ constexpr Y bit_cast(const X& x)
|
||||
|
||||
@@ -17,10 +17,10 @@ namespace ck {
|
||||
// Convert X to Y, both X and Y are non-const data types.
|
||||
template <typename Y,
|
||||
typename X,
|
||||
std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
|
||||
ck::enable_if_t<!(ck::is_const_v<Y> || ck::is_const_v<X>), bool> = false>
|
||||
__host__ __device__ constexpr Y type_convert(X x)
|
||||
{
|
||||
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
|
||||
static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);
|
||||
|
||||
return static_cast<Y>(x);
|
||||
}
|
||||
@@ -28,13 +28,13 @@ __host__ __device__ constexpr Y type_convert(X x)
|
||||
// Convert X to Y, either X or Y is a const data type.
|
||||
template <typename Y,
|
||||
typename X,
|
||||
std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
|
||||
ck::enable_if_t<ck::is_const_v<Y> || ck::is_const_v<X>, bool> = false>
|
||||
__host__ __device__ constexpr Y type_convert(X x)
|
||||
{
|
||||
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
|
||||
static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);
|
||||
|
||||
using NonConstY = std::remove_const_t<Y>;
|
||||
using NonConstX = std::remove_const_t<X>;
|
||||
using NonConstY = ck::remove_const_t<Y>;
|
||||
using NonConstX = ck::remove_const_t<X>;
|
||||
return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
|
||||
}
|
||||
|
||||
@@ -104,7 +104,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
|
||||
template <typename Y, typename X>
|
||||
__host__ __device__ constexpr Y type_convert_sp(X x)
|
||||
{
|
||||
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
|
||||
static_assert(!ck::is_reference_v<Y> && !ck::is_reference_v<X>);
|
||||
|
||||
return static_cast<Y>(x);
|
||||
}
|
||||
@@ -166,7 +166,7 @@ template <>
|
||||
inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
|
||||
{
|
||||
constexpr int seed = 1254739;
|
||||
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
|
||||
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<long_index_t>(&x), x);
|
||||
#if defined(__gfx94__)
|
||||
union
|
||||
{
|
||||
@@ -206,7 +206,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
|
||||
constexpr bool clip = true;
|
||||
constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
|
||||
constexpr int seed = 1254739;
|
||||
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
|
||||
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<long_index_t>(&x), x);
|
||||
return utils::
|
||||
cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
|
||||
x, rng);
|
||||
@@ -218,7 +218,7 @@ template <>
|
||||
inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
|
||||
{
|
||||
constexpr int seed = 1254739;
|
||||
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
|
||||
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<long_index_t>(&x), x);
|
||||
#if defined(__gfx94__)
|
||||
union
|
||||
{
|
||||
@@ -258,7 +258,7 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
|
||||
constexpr bool clip = true;
|
||||
constexpr f8_rounding_mode rm = f8_rounding_mode::stochastic;
|
||||
constexpr int seed = 1254739;
|
||||
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
|
||||
uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<long_index_t>(&x), x);
|
||||
return utils::
|
||||
cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
|
||||
x, rng);
|
||||
@@ -501,6 +501,7 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
template <typename Y, typename X, std::size_t NumElems>
|
||||
inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
|
||||
const std::array<X, NumElems>& x)
|
||||
@@ -510,6 +511,7 @@ inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
|
||||
y[i] = type_convert<Y>(x[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename Y, typename X, index_t NumElems>
|
||||
inline __host__ __device__ void array_convert(Array<Y, NumElems>& y, const Array<X, NumElems>& x)
|
||||
|
||||
Reference in New Issue
Block a user