From 5636576f9b297b6645677aae16d02a2625a8ff01 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 7 Aug 2019 18:27:10 -0500 Subject: [PATCH] bug fix in ford, forgot to reorder lengths --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- .../blockwise_generic_tensor_slice_copy.hpp | 13 ++++----- .../threadwise_generic_tensor_slice_copy.hpp | 27 +++++++------------ .../include/utility/config_amd.hpp.in | 1 + .../include/utility/config_nvidia.hpp.in | 1 + .../include/utility/functional3.hpp | 6 +++-- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- 7 files changed, 24 insertions(+), 28 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 555e68e594..938d203755 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -470,7 +470,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 7, 1, 1>(make_zero_array(), make_zero_array()) - .Run_non_static(p_out_thread, p_out_thread_on_global); + .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index d2161cb4ff..8b82c3978b 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -276,7 +276,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 SrcDataPerAccess, 1>(make_zero_array(), 
make_zero_array()) - .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); + .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); #endif }); } @@ -318,10 +318,11 @@ struct BlockwiseGenericTensorSliceCopy_v1 // By position the origin of the per-thread window at the point, where multi-index // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy // is assuming each thread is copy a noraml (not merged) tensor. -// User need to guarantee this is true. -// By setting SubLengths = 1 at the merged dimension, this is always true; -// If in the future, you want to enable SubLengths > 1 at the merged dimension, -// special care in implementation is needed +// To satisfy this assumption, the user needs to make sure that, on a merged dimension +// that contains multiple original dimensions, the length of the last original +// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on +// the merged dimension needs to be 1. 
These sanity checks are performed in constructor +// of BlockwiseGenericTensorSliceCopy_v1 #if 0 threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc, p_buffer + buffer_offset, @@ -354,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 1, DstDataPerAccess>(make_zero_array(), make_zero_array()) - .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); + .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); #endif }); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 31de9f2fa0..3fd9b0d37a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -10,6 +10,10 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #endif +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 +#endif + namespace ck { // user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1 @@ -369,8 +373,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 static_ford{}([&]( auto long_vector_access_id) { + // data id w.r.t slicing-window constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); @@ -406,26 +412,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 *reinterpret_cast(&p_long_vector[buffer_offset]); }); }); - } - - template - __device__ void 
Run_non_static(const TData* p_src, TData* p_dst) const - { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - +#else ford{}( [&](auto long_vector_access_id) { + // data id w.r.t slicing-window auto long_vector_data_begin_id = long_vector_access_id; long_vector_data_begin_id(vector_access_dim) = @@ -464,6 +454,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 *reinterpret_cast(&p_long_vector[buffer_offset]); } }); +#endif } private: diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index e11ac1b70e..c0484a1f18 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -8,6 +8,7 @@ #define CK_USE_AMD_INLINE_ASM 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 namespace ck { diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 673c2778b1..2287868410 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -10,6 +10,7 @@ #define CK_USE_AMD_INLINE_ASM 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#define 
CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 namespace ck { diff --git a/composable_kernel/include/utility/functional3.hpp b/composable_kernel/include/utility/functional3.hpp index 7bd2b8d0cd..f1c21d7f59 100644 --- a/composable_kernel/include/utility/functional3.hpp +++ b/composable_kernel/include/utility/functional3.hpp @@ -135,9 +135,11 @@ struct ford template __host__ __device__ constexpr void operator()(F f) const { - for(index_t i = 0; i < Lengths::Front(); ++i) + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + + for(index_t i = 0; i < ordered_lengths.Front(); ++i) { - ford_impl{}(f, Array{i}); + ford_impl{}(f, Array{i}); } } }; diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index f5efc83fdf..58827649a8 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t B = (N * Ho * Wo) / (N1 * N2); -#if 0 +#if 1 // each thread hold 64 data constexpr index_t BlockSize = 256;