From 012d3a071b4f0260b53a15b9430695fa40521a07 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 27 Sep 2019 16:38:11 -0500 Subject: [PATCH] tweaking --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 14 ++++-- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_description/tensor_descriptor.hpp | 2 +- .../blockwise_generic_tensor_slice_copy.hpp | 10 ++--- .../threadwise_generic_tensor_slice_copy.hpp | 44 +++++-------------- .../include/utility/config_amd.hpp.in | 2 +- driver/src/driver.cpp | 2 +- 8 files changed, 31 insertions(+), 49 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 1fc948bfe8..faf8764507 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf 0, b_thread_data_on_global, 0}) -#if 0 +#if 1 .template Run #else // tweaking .template Run_optimized_dst_address_calculation __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -217,7 +223,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index db48b1a906..46e551ddd4 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index 21deafe0f6..e202f73e99 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -319,7 +319,7 @@ struct TransformedTensorDescriptor return idx_low; } - // TODO: right now return value is constexpr because use of non-constepxr lambda + // TODO: right now return value is not constexpr because use of non-constepxr lambda __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff( const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old) { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 072091c872..e9cc89fca0 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { -#if 0 +#if 1 mThreadwiseLoad.template Run(p_thread_buffer, p_block_dst); + BlockDstData, + ThreadBufferAddressSpace, + BlockDstAddressSpace>(p_thread_buffer, p_block_dst); #else // tweaking mThreadwiseStore.template Run_optimized_dst_address_calculation()); -#elif 1 - const index_t src_linear_offset = - src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); + src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); #endif // Check src vector's padding situation, only check the first data in @@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask(); constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask(); -#if 0 // debug - if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) - { - print_sequence("dst_linear_dim_mask", dst_linear_dim_mask); - print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask); - } -#endif - static_assert(dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess, "Warning! VectorAccessDim is not DstDesc's linear dimension, performance " @@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); -// this is dst compile-time offset -#if 0 - // TODO: is this good implementation? +#if 1 // tweaking + // this is dst compile-time offset const index_t dst_linear_offset = dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); -#elif 0 +#else + // this is dst compile-time offset const index_t dst_linear_offset = - DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) - - DstDesc::CalculateOffset(make_zero_array()); -#elif 1 - const index_t dst_linear_offset = - dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); + dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); #endif // Check dst vector's padding situation, only check the first data in diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 6057cd8b5c..437ed3ee8f 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -6,7 +6,7 @@ #define CK_UNSIGNED_INDEX_TYPE 0 #define CK_DEVICE_BACKEND_AMD 1 -#define CK_USE_AMD_INTRINSIC 0 +#define CK_USE_AMD_INTRINSIC 1 #define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 85298074b7..4319c4f7d6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -74,7 +74,7 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 1 +#if 0 constexpr index_t N = 128; constexpr index_t C = 128; constexpr index_t HI = 17;