mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-14 02:02:46 +00:00)

refactoring for miopen
@@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");

         using Float4 = float4;
         using Float2 = float2;
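Note on the recurring change in this commit: `is_same<T, U>::value` becomes `is_same<T, U>{}`. That spelling works because the trait derives from an `integral_constant`-style base whose constexpr conversion to bool supplies the value, so a value-initialized trait object is itself usable in a constant expression. A minimal standalone sketch with `std::is_same` (ck's own `is_same` is assumed to behave the same way):

```cpp
#include <type_traits>

template <typename Float>
void check_float_only()
{
    // Both spellings are constant expressions: std::is_same derives from
    // std::integral_constant<bool, B>, whose constexpr operator bool() yields B.
    static_assert(std::is_same<Float, float>::value, "wrong! only support float!");
    static_assert(std::is_same<Float, float>{}, "wrong! only support float!");
}

int main() { check_float_only<float>(); }
```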
@@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3

    __device__ constexpr index_t GetRegisterClipboardSize() const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3
                *(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
                                                           iloop * src_loop_stride]));
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                          "global_load is only for float4");

            global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),
@@ -773,7 +773,7 @@ struct Blockwise2dTensorCopy3
            *(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) =
                *(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]);
 #else
-            static_assert(is_same<float, Float>::value && DataPerRead == 4,
+            static_assert(is_same<float, Float>{} && DataPerRead == 4,
                          "ds_write_b128 is only for float4");

            ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
@@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3

    __device__ constexpr index_t GetRegisterClipboardSize() const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float!\n");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -295,9 +295,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2

 #if CK_USE_AMD_INLINE_ASM
    template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
    {
        constexpr auto a_block_mtx = BlockMatrixA{};
        constexpr auto b_block_mtx = BlockMatrixB{};
@@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;

        // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");

        static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                          MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");

-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");

-        static_assert(
-            BlockMatrixStrideA == 0 && BatchPerThread == 1,
-            "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");

        using Float4 = vector_type<float, 4>::MemoryType;
@@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;

        // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float\n");

        static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                          MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet\n");

-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");

-        static_assert(
-            BlockMatrixStrideA == 0 && BatchPerThread == 1,
-            "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
+        static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
+                      "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
+                      "1 for now\n");

        using Float4 = vector_type<float, 4>::MemoryType;
@@ -5,6 +5,10 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "threadwise_gemm.hpp"

+#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif
+
 namespace ck {

 // if following number are power of 2, index calculation shall be greatly reduced:
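The added `#ifndef`/`#define` block gives the header a default while still letting a build pass `-DCK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM=0` to force the plain C++ path. A hypothetical standalone illustration of that override pattern (everything except the macro name is made up):

```cpp
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 // header default, overridable from the command line
#endif

#include <cstdio>

int main()
{
#if CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
    std::puts("inline-asm GEMM path selected");
#else
    std::puts("generic C++ GEMM path selected");
#endif
}
```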
@@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
            N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
            "wrong! Cannot evenly divide work among\n");

-        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
+        static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
+                                   decltype(GetThreadMatrixCLengths())>{},
                      "wrong! ThreadMatrixC lengths is wrong");

        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
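The replaced assertion compared the two length sequences with a helper `is_same_type(...)`; the new form compares the `decltype` of the two getters directly with `std::is_same`. Because the lengths are encoded in the returned type, type equality here is value equality. A minimal sketch using `std::integer_sequence` in place of ck's `Sequence` (an assumed analogy, not the real types):

```cpp
#include <type_traits>
#include <utility>

// The lengths live in the type, so comparing decltypes compares the lengths.
constexpr std::integer_sequence<int, 8, 8> GetThreadMatrixCLengths() { return {}; }

struct ThreadMatrixC
{
    static constexpr std::integer_sequence<int, 8, 8> GetLengths() { return {}; }
};

static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
                           decltype(GetThreadMatrixCLengths())>{},
              "wrong! ThreadMatrixC lengths is wrong");

int main() {}
```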
@@ -115,11 +120,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
    }

 #if CK_USE_AMD_INLINE_ASM
-    // TODO: this is not working correctly
    template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
-                            const FloatB* __restrict__ p_b_block,
-                            FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
+                                const FloatB* __restrict__ p_b_block,
+                                FloatC* __restrict__ p_c_thread) const
    {
        constexpr auto True  = integral_constant<bool, true>{};
        constexpr auto False = integral_constant<bool, false>{};
@@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
        constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;

        // assertion for inline asm
-        static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
-                          is_same<FloatC, float>::value,
-                      "Run_asm only deal with float\n");
+        static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
+                          is_same<FloatC, float>{},
+                      "Run_amd_asm only deal with float");

        static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
                          MPerThread == 8 && NPerThread == 8,
-                      "Run_asm cannot deal with this GEMM shape yet\n");
+                      "Run_amd_asm cannot deal with this GEMM shape yet");

-        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read");

        using Float4 = vector_type<float, 4>::MemoryType;
@@ -200,9 +204,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 #endif

    template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run(const FloatA* const __restrict__ p_a_block,
-                        const FloatB* const __restrict__ p_b_block,
-                        FloatC* const __restrict__ p_c_thread) const
+    __device__ void Run_source(const FloatA* const __restrict__ p_a_block,
+                               const FloatB* const __restrict__ p_b_block,
+                               FloatC* const __restrict__ p_c_thread) const
    {
        constexpr auto True  = integral_constant<bool, true>{};
        constexpr auto False = integral_constant<bool, false>{};
@@ -291,9 +295,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
    }

    template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block,
-                                             FloatB* const p_b_block,
-                                             FloatC* p_c_thread) const
+    __device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block,
+                                                   FloatB* const p_b_block,
+                                                   FloatC* p_c_thread) const
    {
        constexpr auto True  = integral_constant<bool, true>{};
        constexpr auto False = integral_constant<bool, false>{};
@@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                    p_c_thread);
        }
    }
+    template <class FloatA, class FloatB, class FloatC>
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
 };

 } // namespace ck
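The added `Run` wrapper is the visible effect of this refactor: the old entry points are renamed `Run_amd_asm` and `Run_source`, and callers now go through one `Run` that is switched at preprocessing time. A hypothetical host-side sketch of the same dispatch shape (not the device code, just the pattern):

```cpp
#include <cstdio>

#ifndef USE_AMD_INLINE_ASM // stand-in for CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define USE_AMD_INLINE_ASM 1
#endif

struct BlockwiseGemm
{
    void Run_amd_asm(const float*, const float*, float*) const { std::puts("asm path"); }
    void Run_source(const float*, const float*, float*) const { std::puts("source path"); }

    // Callers only ever see Run(); the macro decides which body is compiled in.
    void Run(const float* p_a, const float* p_b, float* p_c) const
    {
#if USE_AMD_INLINE_ASM
        Run_amd_asm(p_a, p_b, p_c);
#else
        Run_source(p_a, p_b, p_c);
#endif
    }
};

int main()
{
    float a[1]{}, b[1]{}, c[1]{};
    BlockwiseGemm{}.Run(a, b, c);
}
```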
@@ -6,6 +6,10 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"

+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
+#endif
+
 namespace ck {

 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
@@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1

        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;

-        // for now, only support SubLengths.Get() == 1 on a merged dimension that constains
+        // for now, only support SubLengths == 1 on a merged dimension that constains
        // multiple original dimensions
        static_for<0, nDim, 1>{}([&](auto IDim_) {
            constexpr auto IDim = decltype(IDim_){};
@@ -121,7 +125,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto IDim_) {
            constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;

            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);
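Several hunks here and below drop `.Get()` and use the `Number<...>` object directly as an `index_t`. That only works if the wrapper has a constexpr conversion operator; a minimal sketch of such a wrapper (assumed shape, not the ck implementation):

```cpp
#include <cstdint>

using index_t = std::int32_t;

template <index_t N>
struct Number
{
    static constexpr index_t mValue = N;

    constexpr index_t Get() const { return mValue; }      // old spelling
    constexpr operator index_t() const { return mValue; } // enables "constexpr index_t idim = IDim;"
};

int main()
{
    constexpr auto IDim = Number<3>{};
    constexpr index_t idim_old = IDim.Get();
    constexpr index_t idim_new = IDim;
    static_assert(idim_old == idim_new, "both spellings name the same constant");
}
```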
@@ -135,7 +139,7 @@ struct BlockwiseGenericTensorSliceCopy_v1

        static_for<0, nDim, 1>{}([&](auto IDim_) {
            constexpr auto IDim = decltype(IDim_){};
-            constexpr index_t idim = IDim.Get();
+            constexpr index_t idim = IDim;

            constexpr auto dst_partial_original_dims =
                DstDesc::GetContainedOriginalDimensions(IDim);
@@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1

        mThreadDstOffset = accumulate_on_array(
            mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
-
-#if 0
-        if(get_block_1d_id() == 0)
-        {
-            printf("id %5u %5u: "
-                   "src_block_data_multi_id_begin: %u %u %u %u, "
-                   "thread_cluster_multi_id: %u %u %u %u, "
-                   "data_cluster_multi_id: %u %u %u %u, "
-                   "thread_data_multi_id_begin: %u %u %u %u, "
-                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
-                   get_block_1d_id(),
-                   get_thread_local_1d_id(),
-                   src_block_data_multi_id_begin[0],
-                   src_block_data_multi_id_begin[1],
-                   src_block_data_multi_id_begin[2],
-                   src_block_data_multi_id_begin[3],
-                   thread_cluster_multi_id[0],
-                   thread_cluster_multi_id[1],
-                   thread_cluster_multi_id[2],
-                   thread_cluster_multi_id[3],
-                   data_cluster_multi_id[0],
-                   data_cluster_multi_id[1],
-                   data_cluster_multi_id[2],
-                   data_cluster_multi_id[3],
-                   thread_data_multi_id_begin[0],
-                   thread_data_multi_id_begin[1],
-                   thread_data_multi_id_begin[2],
-                   thread_data_multi_id_begin[3],
-                   mThreadSrcOffset,
-                   mThreadDstOffset);
-        }
-#endif
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
@@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-
-            const index_t src_offset =
-                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
-
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-#else // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto src_thread_data_multi_id_begin =
@@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1

            constexpr index_t clipboard_offset =
                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+
+            const index_t src_offset =
+                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
+
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
 #endif

            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
@@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-#if 0
-            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
-
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
-
-            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-
-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
-
-            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
-#else // HIP compiler performs better with these codes
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto clipboard_data_multi_id_begin =
@@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1

            constexpr index_t dst_offset =
                DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+#else
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
+
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+
+            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #endif

            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
@@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
    }

    // When moving the slicing windows along a merged dimension, if the strides of the
-    // contained (by the merged dimension) original dimensions are in descending order,
+    // contained (by the merged dimension) original dimensions are not in descending order,
    // then there is no guarantee that the new offset will be larger than the old offset
    // for movement in positive direction (vice versue for movement in negative direction).
    // As a result, there is the possiblity that the offset calculation may result in
@@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
    {
        constexpr auto IDim = Number<IDim_>{};
-        constexpr index_t idim = IDim.Get();
+        constexpr index_t idim = IDim;

        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
            // logic for a merged dimension, also works for non-merged dimension, but its logic may
@@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                constexpr auto I = decltype(I_){};
                constexpr index_t idim_original = src_partial_original_dims.Get(I);

-                mThreadSrcOriginalMultiId(idim_original) =
-                    new_src_partial_original_multi_id[I.Get()];
+                mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
            });

            // calculate new partial offset on this merged dimension
@@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
            make_ConstantTensorDescriptor_packed(thread_cluster_lengths);

        // sanity check: data type
-        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
+        static_assert(is_same<Float, float>{}, "wrong! only support float for now!\n");

        // sanity check: nDim
        static_assert(SrcDesc::GetNumOfDimension() == nDim &&
@@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr auto I = decltype(IDim){};
-            constexpr index_t i = I.Get();
+            constexpr index_t idim = IDim;
            // compiler: will it really compute index here, or be merged with
            // GetOffsetFromMultiIndex and
            // optimized away???
-            src_data_multi_id(i) *= src_sub_lengths.Get(I);
+            src_data_multi_id(idim) *= src_sub_lengths.Get(IDim);
        });

        // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
@@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
    constexpr index_t nshift = NShift::mValue;

    constexpr index_t did0_end =
-        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
+        is_same<decltype(I0), IDim>{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0);

    constexpr index_t did1_end =
-        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
+        is_same<decltype(I1), IDim>{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1);

    constexpr index_t did2_end =
-        is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
+        is_same<decltype(I2), IDim>{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2);

    constexpr index_t did3_end =
-        is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
+        is_same<decltype(I3), IDim>{} ? desc.GetLength(I3) - nshift : desc.GetLength(I3);

    for(index_t did0 = 0; did0 < did0_end; ++did0)
    {
@@ -71,24 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                integral_constant<bool, TransC>,
                                FloatC* __restrict__ p_c_thread)
 {
-#if 0
-    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-    {
-        printf("p_a_thread: %f %f %f %f\n",
-               p_a_thread[0],
-               p_a_thread[1],
-               p_a_thread[2],
-               p_a_thread[3]);
-        printf("p_b_thread: %f %f %f %f\n",
-               p_b_thread[0],
-               p_b_thread[1],
-               p_b_thread[2],
-               p_b_thread[3]);
-    }
-#endif
-
-    if(TransA && (!TransB) && (!TransC))
-    {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
        constexpr auto a_mtx = MatrixA{};
        constexpr auto b_mtx = MatrixB{};
        constexpr auto c_mtx = MatrixC{};
@@ -111,12 +94,10 @@ __device__ void threadwise_gemm(MatrixA,
                }
            }
        }
-    }
-    else
-    {
+    }).Else([&](auto fwd) {
        // not implemented
-        assert(false);
-    }
+        static_assert(fwd(false), "wrong! support for this config is not implemented");
+    });
 }

 } // namespace ck
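The two hunks above replace a runtime `if/else` guarded by `assert(false)` with the `static_if<...>{}(...).Else(...)` idiom: only the selected lambda is instantiated, and `static_assert(fwd(false), ...)` fires only if the unsupported branch is actually instantiated, because `fwd` makes the condition dependent on the lambda parameter. A rough standalone sketch of the idiom (assumed semantics; ck's `static_if` may differ in detail):

```cpp
#include <cstdio>

// Passed into the selected lambda so expressions can be made dependent on it.
struct forwarder
{
    template <class T>
    constexpr T operator()(T x) const { return x; }
};

template <bool Cond>
struct static_if;

template <>
struct static_if<true>
{
    template <class F>
    static_if operator()(F f) const { f(forwarder{}); return {}; }

    template <class F>
    void Else(F) const {} // true branch taken: Else body is never instantiated
};

template <>
struct static_if<false>
{
    template <class F>
    static_if operator()(F) const { return {}; }

    template <class F>
    void Else(F f) const { f(forwarder{}); }
};

template <bool TransA>
void gemm_like()
{
    static_if<TransA>{}([&](auto fwd) { std::puts("implemented path"); }).Else([&](auto fwd) {
        // fwd(false) depends on the lambda parameter, so the assertion only fires
        // if this Else branch is really instantiated for some configuration.
        static_assert(fwd(false), "wrong! support for this config is not implemented");
    });
}

int main() { gemm_like<true>(); }
```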
@@ -5,6 +5,10 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"

+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#endif
+
 namespace ck {

 template <class Float,
@@ -32,21 +36,18 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(

    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

-#if 0
-    // doesn't compile, because merged-tensor reordering is not implemented
-    // TODO: implement tensor desc ops for merged-tensor
-    constexpr auto src_strides_in_access_order =
-        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+    // TODO: do more sanity-check here, something like:
+    // constexpr auto src_strides_in_access_order =
+    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});

-    constexpr auto dst_strides_in_access_order =
-        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+    // constexpr auto dst_strides_in_access_order =
+    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});

-    // check src/dst stride on the lowest access dimension
-    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
-                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
-                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
-                  "vectorized read/write");
-#endif
+    // // check src/dst stride on the lowest access dimension
+    // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+    //                   (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+    //               "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+    //               "vectorized read/write");

    constexpr auto slice_lengths_in_access_order =
        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
@@ -64,24 +65,7 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(

    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;

-#if 1
-    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
-        auto data_multi_id_in_access_order = access_multi_id;
-        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
-
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
-    });
-#else
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
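`CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1` now chooses between `static_ford`, which hands the loop body its multi-index as compile-time constants, and `ford`, which produces runtime indices; the old in-code comment ("HIP compiler performs better with these codes") is the motivation for keeping both paths selectable. A rough sketch of why the static variant helps, using `std::index_sequence` for a 1-D loop (an assumed analogy, not the ck implementation):

```cpp
#include <cstddef>
#include <cstdio>
#include <initializer_list>
#include <type_traits>
#include <utility>

// Unroll a small loop at compile time; each invocation of f sees its index as a constant.
template <std::size_t... Is, class F>
void static_ford_1d(std::index_sequence<Is...>, F f)
{
    (void)std::initializer_list<int>{(f(std::integral_constant<std::size_t, Is>{}), 0)...};
}

int main()
{
    static_ford_1d(std::make_index_sequence<4>{}, [](auto i) {
        // The offset folds to a literal because decltype(i)::value is a constant expression,
        // which is what the CK_EXPERIMENTAL_* path is after for the real offset computation.
        constexpr std::size_t offset = decltype(i)::value * 8;
        std::printf("offset = %zu\n", offset);
    });
}
```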
@@ -97,6 +81,23 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
        const index_t dst_index =
            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);

        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
    });
+#else
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order = access_multi_id;
+        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
@@ -56,7 +56,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,

    static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
        static_for<0, nRead, 1>{}([&](auto IRead) {
-            constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
+            constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead * DataPerRead>{});

            const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id);
@@ -177,8 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,

        // pack data
        static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
-            const auto dst_multi_id =
-                ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get());
+            const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite + IDstData);

            const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
@@ -189,7 +188,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
        });

        // write data
-        const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);
+        const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite);

        const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);