Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-12 09:16:52 +00:00)
Improve buffer address for out of bound check (#21)
* Use buffer load built-in OOB check; buffer size is limited to 2GB.
* Buffer APIs use combined wave and thread offset.
* Use uint32_t for the address shift in buffer addressing.
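In terms of observable behaviour, the change makes an out-of-range source element read back as zero and an out-of-range destination element receive no write (Set) or no atomic update (AtomicAdd). Below is a minimal, illustrative C++ sketch of that per-element contract; the names are placeholders, and the real code paths are the SetData/AtomicAddData functors and the amd_buffer_* wrappers shown in the diff further down.

    #include <cstddef>

    // Illustrative sketch only: scalar version of the validity contract that the
    // reworked transfer_data(src, src_offset, src_valid, src_range,
    //                        dst, dst_offset, dst_valid, dst_range) interface enforces.
    template <typename T>
    void set_with_oob_check(const T* p_src, std::size_t src_offset, bool src_valid,
                            T* p_dst, std::size_t dst_offset, bool dst_valid)
    {
        if(!dst_valid)
            return;                                        // invalid destination: skip the write

        p_dst[dst_offset] = src_valid ? p_src[src_offset]  // valid source: normal copy
                                      : T(0);              // invalid source: read back zero
    }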
@@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));

// output tensor
constexpr auto out_gemmk_gemmn_global_desc =
constexpr auto out_gemmm_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),

@@ -130,8 +130,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
Float,
AccFloat,
decltype(wei_gemmk_gemmm_global_desc),
decltype(in_gemmm_gemmn_global_desc),
decltype(out_gemmk_gemmn_global_desc),
decltype(in_gemmk_gemmn_global_desc),
decltype(out_gemmm_gemmn_global_desc),
InMemoryDataOperation::Set,
GemmMPerBlock,
GemmNPerBlock,

@@ -84,21 +84,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();

if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
}
else
{
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
}
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
}
}

@@ -106,22 +95,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();

if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
p_block_dst);
}
else
{
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
}
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
}
}

@@ -93,11 +93,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];

#if 1
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
#endif

// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)

@@ -112,17 +114,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsibility to make sure all data in the src vector
// has the valid/invalid mapping situation
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
SrcDesc::GetElementSpace(),
p_src_long_vector,
buffer_offset,
true,
long_vector_size);
}

// SrcData to DstData conversion

@@ -146,336 +151,24 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check dst data's valid mapping situation, only check the first data in this dst
// vector. It's user's responsibility to make sure all data in the dst vector
// has the valid/invalid mapping situation
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(p_dst_long_vector,
buffer_offset,
true,
long_vector_size,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
DstDesc::GetElementSpace());
}
});
}

// Modify Length to 1, if Mask is set to false
// Used for isolating linear dimension from non-linear dimensions
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
return Sequence<(Mask ? Lengths : 1)...>{};
}

// Will do valid mapping check on src data: Read 0 if src data has an invalid mapping
// Will do valid mapping check on dst data: No write if dst data has an invalid mapping
// This version is optimized for address calculation of src tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};

constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};

constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};

constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

// separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

static_assert(
src_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == SrcDataPerRead,
"Warning! SrcDstVectorReadWriteDim is not SrcDesc's linear dimension, performance "
"would drop");

// separate steps into linear and non-linear components, according to src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);

constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);

// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {

// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];

// move src coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;

// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {

// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];

// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}

// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be src's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;

const index_t buffer_offset = i * src_data_per_access;

// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif

// Check src data's valid mapping situation, only check the first data in this
// src
// vector. It's user's responsibility to make sure all data in the src vector
// has the valid/invalid mapping situation
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(p_src,
src_nonlinear_coord.GetOffset() +
src_linear_offset,
p_src_long_vector,
buffer_offset);
}
}

// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];

for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}

// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;

const index_t buffer_offset = i * dst_data_per_access;

// dst offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);

// Check dst data's valid mapping situation, only check the first data in this
// dst
// vector. It's user's responsibility to make sure all data in the dst vector
// has the valid/invalid mapping situation
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
}
});
});
}

// This version is optimized for address calculation of dst tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};

constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};

constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};

constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

// separate linear dimensions from non-linear dimensions
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

static_assert(
dst_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == DstDataPerWrite,
"Warning! SrcDstVectorReadWriteDim is not DstDesc's linear dimension, performance "
"would drop");

// separate steps into linear and non-linear components, according to dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);

constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);

// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {

// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];

// move dst coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;

// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {

// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];

// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}

// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be dst's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;

const index_t buffer_offset = i * src_data_per_access;

// src offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);

// Check src data's valid mapping situation, only check the first data in this
// src
// vector. It's user's responsibility to make sure all data in the src vector
// has the valid/invalid mapping situation
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
}

// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];

for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}

// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;

const index_t buffer_offset = i * dst_data_per_access;

// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif

// Check dst data's valid mapping situation, only check the first data in this
// dst
// vector. It's user's responsibility to make sure all data in the dst vector
// has the valid/invalid mapping situation
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
buffer_offset,
p_dst,
dst_nonlinear_coord.GetOffset() +
dst_linear_offset);
}
}
});
});
}

__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}

template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)

(File diff suppressed because it is too large.)
@@ -49,12 +49,13 @@
#endif

// experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0

@@ -47,38 +47,69 @@ struct SetData

// This version is only for compatibility, don't use this version if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
if(dst_valid)
{
if(src_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
else
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
}
}
}

#if CK_USE_AMD_BUFFER_ADDRESSING
// buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr
// 2) p_src to be a block-invariant pointer.
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
// 2) p_src_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset,
bool src_valid,
index_t src_range,
T* p_dst,
index_t dst_offset) const
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
if(dst_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
}
}

// buffer_store requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 2) p_dst to be a block-invariant pointer.
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset) const
index_t dst_offset,
bool dst_valid,
index_t dst_range) const
{
amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
const auto zeros = vector_t(0);

amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, dst_valid, dst_range);
}
#endif
};
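To illustrate the "wavewise pointer" requirement stated above: the base pointer handed to the buffer operations is meant to be identical across the wave, while the per-lane part of the address travels as a 32-bit offset together with the validity flag and the element range. A self-contained sketch of that split follows; the struct, helper, and parameter names here are assumptions for illustration, not the library's API.

    #include <cstdint>
    #include <cstddef>

    // Illustrative only: decompose a per-lane global access into the pieces the
    // buffer instructions consume: a wave-invariant base, a 32-bit per-lane offset,
    // and an in-range flag that drives the built-in out-of-bound handling.
    template <typename T>
    struct WavewiseAccess
    {
        const T*      p_wave;        // must be the same for every lane of the wave
        std::uint32_t lane_offset;   // per-lane element offset, kept in 32 bits
        bool          valid;         // false => hardware returns 0 / drops the store
    };

    template <typename T>
    WavewiseAccess<T> make_wavewise_access(const T*    p_base,
                                           std::size_t wave_element_offset,   // wave-uniform part
                                           std::size_t lane_element_offset,   // per-lane part
                                           std::size_t element_space)         // tensor element space
    {
        return {p_base + wave_element_offset,
                static_cast<std::uint32_t>(lane_element_offset),
                lane_element_offset < element_space};
    }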
@@ -90,24 +121,43 @@ struct AtomicAddData

// This version is only for compatibility, don't use this version if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
if(src_valid && dst_valid)
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}
}

#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 2) p_dst to be a block-invariant pointer.
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t dst_range) const
{
amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
const auto zeros = vector_t(0);

amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
p_dst,
dst_offset,
dst_valid,
dst_range);
}
#endif
};
@@ -119,7 +169,14 @@ template <typename T,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
__device__ void transfer_data(const T* p_src,
index_t src_offset,
bool src_valid,
index_t src_range,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t dst_range)
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,

@@ -131,27 +188,41 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, inde
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
});

static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; i++)
for(index_t i = 0; i < DataPerAccess; ++i)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
p_src,
src_offset + i * SrcDataStride,
src_valid,
src_range,
p_dst,
dst_offset + i * DstDataStride,
dst_valid,
dst_range);
});

static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
p_src,
src_offset + i * SrcDataStride,
src_valid,
src_range,
p_dst,
dst_offset + i * DstDataStride,
dst_valid,
dst_range);
});
}
}