mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
tweaking
This commit is contained in:
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
|
||||
0,
|
||||
b_thread_data_on_global,
|
||||
0})
|
||||
#if 0
|
||||
#if 1
|
||||
.template Run<Float, Float, address_space_t::generic, address_space_t::global>
|
||||
#else // tweaking
|
||||
.template Run_optimized_dst_address_calculation<Float,
|
||||
|
||||
@@ -78,10 +78,12 @@ struct NativeTensorCoordinate
|
||||
return coord;
|
||||
}
|
||||
|
||||
#if 0 // tweaking
|
||||
// Returns the offset difference for the index difference idx_diff by forwarding
// idx_diff unchanged to tensor_desc_type::CalculateOffsetDiff.
// Declared static: it reads no state of the coordinate object itself.
// NOTE(review): in this view the function sits between `#if 0 // tweaking` and
// `#endif`, so it appears to be compiled out — confirm before relying on it.
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
|
||||
{
|
||||
return tensor_desc_type::CalculateOffsetDiff(idx_diff);
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
|
||||
|
||||
@@ -175,6 +177,7 @@ struct TransformedTensorCoordinate
|
||||
return coord_up;
|
||||
}
|
||||
|
||||
#if 0 // tweaking
|
||||
// Calculate offset diff without updating tensor-coordinate
|
||||
// If idx_up_diff is known at compile time, and has only non-zero entries on linear dimensions,
|
||||
// then all calculation can be done at compile-time.
|
||||
@@ -183,9 +186,12 @@ struct TransformedTensorCoordinate
|
||||
// For transformation of multi-index difference, not all transformation functions need to
|
||||
// know the old lower-index or the old upper-index. We pass both of them to the
|
||||
// transformation function. The transformation function itself decides to use them or not.
|
||||
return GetLowerCoordinate().CalculateOffsetDiff(tensor_desc_type::CalculateLowerIndexDiff(
|
||||
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()));
|
||||
const auto idx_low_diff = tensor_desc_type::CalculateLowerIndexDiff(
|
||||
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
|
||||
|
||||
return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
|
||||
{
|
||||
@@ -209,7 +215,7 @@ struct TensorCoordinate
|
||||
private:
|
||||
template <typename... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
|
||||
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
|
||||
{
|
||||
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
|
||||
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
|
||||
@@ -217,7 +223,7 @@ struct TensorCoordinate
|
||||
|
||||
template <typename... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
|
||||
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
|
||||
{
|
||||
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
|
||||
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
|
||||
|
||||
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
|
||||
private:
|
||||
template <class... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
|
||||
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
|
||||
{
|
||||
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
|
||||
}
|
||||
|
||||
template <class... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
|
||||
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
|
||||
{
|
||||
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
|
||||
}
|
||||
|
||||
@@ -319,7 +319,7 @@ struct TransformedTensorDescriptor
|
||||
return idx_low;
|
||||
}
|
||||
|
||||
// TODO: right now return value is constexpr because use of non-constepxr lambda
|
||||
// TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
|
||||
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
|
||||
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
|
||||
{
|
||||
|
||||
@@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
|
||||
ThreadBufferData* p_thread_buffer) const
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
mThreadwiseLoad.template Run<BlockSrcData,
|
||||
ThreadBufferData,
|
||||
BlockSrcAddressSpace,
|
||||
@@ -94,11 +94,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
|
||||
BlockDstData* p_block_dst) const
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
mThreadwiseStore.template Run<ThreadBufferData,
|
||||
BlockDstData,
|
||||
ThreadBufferAddressSpace,
|
||||
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
|
||||
BlockDstData,
|
||||
ThreadBufferAddressSpace,
|
||||
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
|
||||
#else // tweaking
|
||||
mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
|
||||
BlockDstData,
|
||||
|
||||
@@ -226,14 +226,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
|
||||
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
|
||||
|
||||
#if 0 // debug
|
||||
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
|
||||
{
|
||||
print_sequence("src_linear_dim_mask", src_linear_dim_mask);
|
||||
print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
|
||||
long_vector_size == SrcDataPerAccess,
|
||||
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
|
||||
@@ -295,18 +287,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
const auto src_coord =
|
||||
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
|
||||
|
||||
// this is src compile-time offset
|
||||
#if 0
|
||||
// TODO: is this good implementation?
|
||||
#if 1 // tweaking
|
||||
// this is src compile-time offset
|
||||
const index_t src_linear_offset =
|
||||
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
|
||||
#elif 0
|
||||
#else
|
||||
// this is src compile-time offset
|
||||
const index_t src_linear_offset =
|
||||
SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
|
||||
SrcDesc::CalculateOffset(make_zero_array<index_t, nDim>());
|
||||
#elif 1
|
||||
const index_t src_linear_offset =
|
||||
src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
|
||||
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
|
||||
#endif
|
||||
|
||||
// Check src vector's padding situation, only check the first data in
|
||||
@@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
|
||||
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
|
||||
|
||||
#if 0 // debug
|
||||
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
|
||||
{
|
||||
print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
|
||||
print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
|
||||
long_vector_size == DstDataPerAccess,
|
||||
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
|
||||
@@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
const auto dst_coord =
|
||||
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
|
||||
|
||||
// this is dst compile-time offset
|
||||
#if 0
|
||||
// TODO: is this good implementation?
|
||||
#if 1 // tweaking
|
||||
// this is dst compile-time offset
|
||||
const index_t dst_linear_offset =
|
||||
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
|
||||
#elif 0
|
||||
#else
|
||||
// this is dst compile-time offset
|
||||
const index_t dst_linear_offset =
|
||||
DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
|
||||
DstDesc::CalculateOffset(make_zero_array<index_t, nDim>());
|
||||
#elif 1
|
||||
const index_t dst_linear_offset =
|
||||
dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
|
||||
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
|
||||
#endif
|
||||
|
||||
// Check dst vector's padding situation, only check the first data in
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#define CK_UNSIGNED_INDEX_TYPE 0
|
||||
#define CK_DEVICE_BACKEND_AMD 1
|
||||
#define CK_USE_AMD_INTRINSIC 0
|
||||
#define CK_USE_AMD_INTRINSIC 1
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
|
||||
|
||||
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
constexpr index_t N = 128;
|
||||
constexpr index_t C = 128;
|
||||
constexpr index_t HI = 17;
|
||||
|
||||
Reference in New Issue
Block a user