From 14315b72f33deeab492278476a88103ae78ff3cb Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 27 Sep 2019 15:24:27 -0500 Subject: [PATCH] tweaking --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 22 ++++++++++-- .../tensor_coordinate_deprecated.hpp | 4 +-- .../threadwise_generic_tensor_slice_copy.hpp | 34 ++++++++++++++++--- driver/src/driver.cpp | 18 +++++----- 5 files changed, 60 insertions(+), 20 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index faf8764507..1fc948bfe8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf 0, b_thread_data_on_global, 0}) -#if 1 +#if 0 .template Run #else // tweaking .template Run_optimized_dst_address_calculation __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -201,7 +217,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 46e551ddd4..db48b1a906 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index ae50d65c3e..c21cf737c1 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -226,6 +226,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask(); constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask(); +#if 0 // debug + if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) + { + print_sequence("src_linear_dim_mask", src_linear_dim_mask); + print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask); + } +#endif + static_assert(src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess, "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance " @@ -292,9 +300,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: is this good implementation? const index_t src_linear_offset = src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); -#else +#elif 0 const index_t src_linear_offset = - SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id); + SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) - + SrcDesc::CalculateOffset(make_zero_array()); +#elif 1 + const index_t src_linear_offset = + src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); #endif // Check src vector's padding situation, only check the first data in @@ -384,6 +396,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask(); constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask(); +#if 0 // debug + if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) + { + print_sequence("dst_linear_dim_mask", dst_linear_dim_mask); + print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask); + } +#endif + static_assert(dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess, "Warning! VectorAccessDim is not DstDesc's linear dimension, performance " @@ -477,13 +497,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); // this is dst compile-time offset -#if 1 +#if 0 // TODO: is this good implementation? const index_t dst_linear_offset = dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); -#else +#elif 0 const index_t dst_linear_offset = - DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id); + DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) - + DstDesc::CalculateOffset(make_zero_array()); +#elif 1 + const index_t dst_linear_offset = + dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); #endif // Check dst vector's padding situation, only check the first data in diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 9fe0fb5dbc..85298074b7 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -74,20 +74,20 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 0 - constexpr index_t N = 64; - constexpr index_t C = 256; - constexpr index_t HI = 56; - constexpr index_t WI = 56; - constexpr index_t K = 256; +#if 1 + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; constexpr index_t Y = 1; - constexpr index_t X = 1; + constexpr index_t X = 7; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64;