From 545d9305687c083717274171fdb22c74a5b5615e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 24 Sep 2019 18:06:05 -0500 Subject: [PATCH] refactor --- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 2 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 40 ++++++----- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 2 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 66 ++++++++++++----- .../ConstantTensorDescriptor.hpp | 11 ++- .../multi_index_transform.hpp | 70 ++++++++++--------- .../tensor_coordinate_helper.hpp | 16 +++++ .../tensor_coordinate_v2.hpp | 19 ++--- .../tensor_description/tensor_descriptor.hpp | 33 ++++++--- .../blockwise_3d_tensor_op.hpp | 2 +- .../blockwise_4d_tensor_op.hpp | 2 +- .../blockwise_generic_tensor_slice_copy.hpp | 52 ++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 66 +++++++---------- composable_kernel/include/utility/array.hpp | 3 +- .../include/utility/functional.hpp | 6 ++ .../include/utility/sequence.hpp | 36 ++++++++-- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 48 ++++++++++++- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 +- driver/src/driver.cpp | 4 +- 23 files changed, 301 insertions(+), 187 deletions(-) create mode 100644 composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 9ad3f8148e..379ca76970 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -100,7 +100,7 @@ struct 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 7e5c727d15..42931a6ae8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -100,7 +100,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! 
aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 6f92b30617..a124a67e2e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -107,7 +107,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index be747eaca3..4d941b5053 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -107,7 +107,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || 
ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); @@ -174,9 +174,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf decltype(in_e_n1_b_n2_global_desc), decltype(in_e_n1_b_n2_block_desc), Sequence<0, 1, 0, 1>, - Sequence<1, 0, 1, 0>, Sequence<1, 1, 1, 1>, - Sequence<0, 0, 0, 0>, decltype(in_e_n1_b_n2_block_desc.GetLengths()), InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2, @@ -219,9 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf decltype(wei_e_k_global_desc), decltype(wei_e_k_block_desc), Sequence<1, 1>, - Sequence<0, 0>, Sequence<1, 1>, - Sequence<0, 0>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -299,8 +295,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run( + p_in_global, p_in_block_double); + blockwise_wei_copy.template Run( + p_wei_global, p_wei_block_double); } // LDS double buffer: main body @@ -331,15 +329,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - 
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_next); } } @@ -355,17 +357,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_double + in_block_space); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -424,9 +428,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf ThreadwiseGenericTensorSliceCopy_v4r2, - Sequence<0, 0, 0, 0, 0>, Sequence<1, 1, 1, 0, 1>, - Sequence<0, 0, 0, 1, 0>, decltype( out_k0_k1_n1_b_n2_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 5, 1>::type, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 1ec519aad6..a3ecf64941 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -84,7 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index ad411be6db..1bc814c009 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -84,7 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! 
aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index f3976d43bc..fbe387e788 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 2172751e6e..0305a87924 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -90,7 +90,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || 
ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); @@ -145,6 +145,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 1>, decltype(in_e_b_block_desc.GetLengths()), InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B, @@ -157,13 +159,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf InBlockCopyDataPerAccess_B>( {0, b_block_data_on_global}, {0, 0}); - // weight tensor - // global mem +// weight tensor +// global mem +#if 0 constexpr auto wei_e_k_global_desc = transform_tensor_descriptor(wei_k_c_y_x_global_desc, make_tuple(Merge>{}, PassThrough{}), make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); +#else // hack + constexpr auto wei_e_k_global_desc_old = + WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + + constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( + wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); +#endif // LDS // be careful of LDS alignment @@ -176,6 +186,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 1>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -253,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run( + p_in_global, p_in_block_double); + blockwise_wei_copy.template Run( + p_wei_global, p_wei_block_double); } // LDS double buffer: main body @@ -285,15 +299,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf 
__syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_next); } } @@ -309,17 +327,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_double + in_block_space); + blockwise_wei_copy.template RunStoreRegisterBuffer( + 
p_wei_register_buffer, p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -367,9 +387,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); // output threadwise copy - auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v4r2< + ThreadwiseGenericTensorSliceCopy_v4r2< decltype(out_k0_k1_b0_b1_thread_desc), decltype(out_k0_k1_b0_b1_global_desc), + Sequence<1, 1, 1, 1>, + Sequence<1, 1, 0, 0>, decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 4, 1>::type, 3, @@ -378,9 +400,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf {k_thread_data_on_global / K1, k_thread_data_on_global % K1, b_thread_data_on_global / B1, - b_thread_data_on_global % B1}); - - threadwise_out_copy.Run(p_out_thread, p_out_global); + b_thread_data_on_global % B1}) +#if 1 + .template Run_generic +#elif 1 + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index 65a7b219f6..6dfbe5f794 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -96,13 +96,12 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto GetElementSize() { - return Number{}, Number<1>{})>{}; + return Number{}, Number<1>{})>{}; } __host__ __device__ static constexpr auto GetElementSpace() { - constexpr index_t element_space_unaligned = accumulate_on_sequence( + constexpr index_t element_space_unaligned = reduce_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); return Number{}; @@ -155,7 +154,7 @@ struct ConstantTensorDescriptor constexpr auto multi_id = Sequence{}; - return Number{}, 
Number<0>{})>{}; } @@ -389,7 +388,7 @@ struct ConstantTensorDescriptor constexpr auto fold_intervals = Sequence{}; constexpr index_t fold_intervals_product = - accumulate_on_sequence(fold_intervals, math::multiplies{}, Number<1>{}); + reduce_on_sequence(fold_intervals, math::multiplies{}, Number<1>{}); constexpr auto unfold_length = GetLength(Number{}); constexpr auto unfold_stride = GetStride(Number{}); @@ -447,7 +446,7 @@ struct ConstantTensorDescriptor static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable"); // unfolded length, stride - constexpr index_t unfold_length = accumulate_on_sequence( + constexpr index_t unfold_length = reduce_on_sequence( GetLengths().Extract(middle), math::multiplies{}, Number<1>{}); constexpr index_t unfold_stride = GetStride(Number{}); diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index 47f2d97089..6acbe55233 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -41,11 +41,10 @@ struct PassThrough __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -82,23 +81,38 @@ struct Pad __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const + __host__ __device__ constexpr bool + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const { - bool flag = false; +#if 0 + struct lambda_no_pad + { + __host__ __device__ constexpr bool operator()(index_t x) const { return x == 0; } + }; - static_for<0, nDim, 1>{}([&](auto idim) { - // only check if there is left-padding - static_if<(LeftPads::At(idim) != 0)>{}( - [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); }); + if(sequence_all_of(LeftPads{}, lambda_no_pad{}) && + sequence_all_of(RightPads{}, lambda_no_pad{})) + { + return true; + } + else +#endif + { + bool flag = true; - // only check if there is right-padding - static_if<(RightPads::At(idim) != 0)>{}([&](auto) { - flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowerLengths::At(idim); + static_for<0, nDim, 1>{}([&](auto idim) { + // only check if there is left-padding + static_if<(LeftPads::At(idim) != 0)>{}( + [&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); }); + + // only check if there is right-padding + static_if<(RightPads::At(idim) != 0)>{}([&](auto) { + flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim)); + }); }); - }); - return flag; + return flag; + } } }; @@ -155,16 +169,10 @@ struct Merge LowerLengths::PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); -#if 1 // would these 2 versions be compiled to same ISA? 
- // calculate index in each of the dimensions in the order of their dimension static_for<0, nDimLow - 1, 1>{}( lambda_CalculateLowerIndex(itmp, idx_low)); idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1]; -#else - static_for<0, nDimLow, 1>{}( - lambda_CalculateLowerIndex(itmp, idx_low)); -#endif return idx_low; } @@ -244,6 +252,7 @@ struct Merge }); // highest dimension, no out-of-bound check + if(borrow) { --idx_low_new(0); @@ -255,11 +264,10 @@ struct Merge __host__ __device__ static constexpr bool IsLinearTransform() { return false; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -304,11 +312,10 @@ struct Unmerge __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -362,9 +369,9 @@ struct Embed __host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -404,11 +411,10 @@ struct Vectorize __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? 
__host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp new file mode 100644 index 0000000000..2b0550f8ab --- /dev/null +++ b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp @@ -0,0 +1,16 @@ +#ifndef CK_TENSOR_COORDINATE_HELPER_HPP +#define CK_TENSOR_COORDINATE_HELPER_HPP + +#include "tensor_coordinate_v2.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto +make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) +{ + return typename TensorCoordinate_v2::type(idx); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 62dc8b4c9a..b88b52671a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -76,8 +76,7 @@ struct NativeTensorCoordinate return coord; } - // TODO: should this function be here? should it be specific for padding check? - __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; } + __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } private: // mIndex may be saved and update, however, the value of some (or all) of its entries may @@ -166,11 +165,11 @@ struct TransformedTensorCoordinate return coord_up; } - // TODO: should this function be here? should it be specific for padding check?
- __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const + // this function should be inexpensive, because there is no upper-to-lower index transformation + __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const { - return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) || - mCoordLow.IsAnyLevelIndexInPaddingArea(); + return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) && + mCoordLow.IsUpperIndexMappedToValidOffset(); } private: @@ -206,11 +205,5 @@ struct TensorCoordinate_v2 using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); }; -template -__host__ __device__ constexpr auto -make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) -{ - return typename TensorCoordinate_v2::type(idx); -} -} +} // namespace ck #endif diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index f307286f70..dc2039355c 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -66,12 +66,12 @@ struct NativeTensorDescriptor __host__ __device__ static constexpr index_t GetElementSize() { - return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); + return reduce_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); } __host__ __device__ static constexpr index_t GetElementSpace() { - return accumulate_on_sequence( + return reduce_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); } @@ -120,10 +120,10 @@ struct NativeTensorDescriptor } #endif - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */) + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidOffset(const Index& /* idx */) { - return false; + return true; } }; @@ -290,7 +290,7 @@ struct TransformedTensorDescriptor __host__ __device__ static constexpr index_t GetElementSize() { - return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); + return reduce_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); } __host__ __device__ static constexpr index_t GetElementSpace() @@ -375,7 +375,7 @@ struct TransformedTensorDescriptor constexpr bool is_linear_transform = tran.IsLinearTransform(); // judge if all lower dimension are linear - constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence( + constexpr bool is_all_low_dim_linear = math::reduce_on_sequence( pick_sequence_elements_by_mask( GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), math::logic_and{}, @@ -441,21 +441,32 @@ struct TransformedTensorDescriptor } #endif - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) { - bool flag = false; + bool flag = true; static_for<0, nTransform, 1>{}([&](auto itran) { constexpr auto tran = Transforms{}.At(itran); const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran)); - flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part)); + flag = flag && tran.IsUpperIndexMappedToValidLowerIndex(to_array(idx_up_part)); }); return flag; } + + // Whenever this function is called, it will call CalculateLowerIndex() recursively + // If you have created a tensor coordinate already, instead of calling this function, + // you should call TransformedTensorCoordinate::IsUpperIndexMappedToValidOffset() + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up) + { + return IsUpperIndexMappedToValidLowerIndex(idx_up) && + GetLowerTensorDescriptor().IsUpperIndexMappedToValidOffset( + CalculateLowerIndex(idx_up)); + } }; } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp index 0c4ed9d5c1..9ba37acd6d 100644 --- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp @@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3 "wrrong! 
BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); + reduce_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { diff --git a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp index 4185e066fb..93e859469e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp @@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3 "wrrong! BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); + reduce_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index e4a9af67be..7076e26d5b 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -681,9 +681,7 @@ template + template __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { -#if 0 - mThreadwiseLoad.Run_generic(p_src, p_buffer); -#elif 1 - // hardcoded: src is global memory - mThreadwiseLoad.template Run_generic(p_src, p_buffer); -#elif 1 - // hardcoded: src is global memory - mThreadwiseLoad - .template Run_optimized_src_address_calculation( - p_src, p_buffer); +#if 1 + mThreadwiseLoad.template Run_generic( + p_src, p_buffer); +#else + mThreadwiseLoad.template Run_optimized_src_address_calculation( + p_src, p_buffer); #endif } - template + template __device__ void RunStoreRegisterBuffer(const TData* 
p_buffer, TData* p_dst) const { -#if 0 - mThreadwiseStore.Run_generic(p_buffer, p_dst); -#elif 1 - // hardcoded: dst is lds - mThreadwiseStore.template Run_generic(p_buffer, p_dst); -#elif 1 - // hardcoded: dst is lds - mThreadwiseStore - .template Run_optimized_dst_address_calculation(p_buffer, +#if 1 + mThreadwiseStore.template Run_generic( + p_buffer, p_dst); +#else + mThreadwiseStore.template Run_optimized_dst_address_calculation(p_buffer, p_dst); #endif } - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { TData p_buffer[GetRegisterBufferSize()]; - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); + RunLoadRegisterBuffer(p_src, p_buffer); + RunStoreRegisterBuffer(p_buffer, p_dst); } template @@ -802,9 +798,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 ThreadwiseGenericTensorSliceCopy_v4r2::type, - typename uniform_sequence_gen::type, SubLengths, SrcDimAccessOrder, SrcVectorAccessDim, @@ -815,9 +809,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 ThreadwiseGenericTensorSliceCopy_v4r2::type, - typename uniform_sequence_gen::type, DstLinearDimensionMask, - DstNonLinearDimensionMask, SubLengths, DstDimAccessOrder, DstVectorAccessDim, diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 99148042f2..ef43b3d380 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -1131,9 +1131,7 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1 template {}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1260,13 +1257,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id); -// Check dst vector's padding situation, only check the first data in 
this dst -// vector. It's user's responsiblity to make sure all data in the dst vector has -// the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in this dst + // vector. It's user's responsiblity to make sure all data in the dst vector has + // the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1303,7 +1297,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor - template + template __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -1321,8 +1317,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() // is implemented - constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; - constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{}; + constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; + constexpr auto src_nonlinear_dim_mask = + SrcLinearDimensionMask::Transform(logical_not{}); static_assert( src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess, @@ -1392,9 +1389,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in // this src vector. 
It's user's responsiblity to make sure all data in // the src vector has the same padding situation - // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is - // neccessary - if(!src_coord.IsAnyLevelIndexInPaddingArea()) + if(src_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1427,14 +1422,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id); -// Check dst vector's padding situation, only check the first data in -// this dst vector. It's user's responsiblity to make sure all data in -// the dst vector has the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is -// neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in + // this dst vector. It's user's responsiblity to make sure all data in + // the dst vector has the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = *reinterpret_cast(&p_long_vector[buffer_offset]); @@ -1450,7 +1441,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. 
// This version is optimized for address calculation of dst tensor - template + template __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -1468,8 +1461,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() // is implemented - constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{}; - constexpr auto dst_nonlinear_dim_mask = DstNonLinearDimensionMask{}; + constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{}; + constexpr auto dst_nonlinear_dim_mask = + DstLinearDimensionMask::Transform(logical_not{}); static_assert( dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess, @@ -1535,9 +1529,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in // this src vector. It's user's responsiblity to make sure all data in // the src vector has the same padding situation - // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is - // neccessary - if(!src_coord.IsAnyLevelIndexInPaddingArea()) + if(src_coord.IsUpperIndexMappedToValidOffset()) { *reinterpret_cast(&p_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -1561,14 +1553,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const index_t dst_linear_offset = dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); -// Check dst vector's padding situation, only check the first data in -// this dst vector. It's user's responsiblity to make sure all data in -// the dst vector has the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is -// neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in + // this dst vector. 
It's user's responsiblity to make sure all data in + // the dst vector has the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index b0ffa86785..213b20530d 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -110,8 +110,7 @@ struct ArrayElementPicker __host__ __device__ explicit constexpr ArrayElementPicker(Arr& array) : mArray{array} { - constexpr index_t imax = - accumulate_on_sequence(Picks{}, math::maxer{}, Number<0>{}); + constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer{}, Number<0>{}); static_assert(imax < Arr::Size(), "wrong! exceeding # array element"); } diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 3dd469c8bc..6232e2243f 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -25,6 +25,12 @@ struct swallow } }; +template +struct logical_not +{ + constexpr bool operator()(const T& x) const { return !x; } +}; + // Emulate if constexpr template struct static_if; diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 55bda8aede..3ccc8c5f65 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -764,12 +764,12 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) #endif template -struct lambda_accumulate_on_sequence +struct lambda_reduce_on_sequence { const Reduce& f; index_t& result; - __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_) + __host__ __device__ constexpr lambda_reduce_on_sequence(const Reduce& f_, index_t& result_) : f(f_), 
result(result_) { } @@ -783,14 +783,42 @@ struct lambda_accumulate_on_sequence template __host__ __device__ constexpr index_t -accumulate_on_sequence(Seq, Reduce f, Number /*initial_value*/) +reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) { index_t result = Init; - static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence(f, result)); + static_for<0, Seq::Size(), 1>{}(lambda_reduce_on_sequence(f, result)); return result; } +// TODO: a generic any_of for any container +template +__host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) +{ + bool flag = false; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag || f(Seq::At(i)); + } + + return flag; +} + +// TODO: a generic all_of for any container +template +__host__ __device__ constexpr bool sequence_all_of(Seq, F f /*initial_value*/) +{ + bool flag = true; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag && f(Seq::At(i)); + } + + return flag; +} + } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 17813f09a4..679009680c 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -33,9 +33,18 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; +#if 1 constexpr auto in_nchw_desc = InDesc{}; constexpr auto wei_kcyx_desc = WeiDesc{}; constexpr auto out_nkhw_desc = OutDesc{}; +#else + constexpr auto in_nchw_desc = + make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); + constexpr auto wei_kcyx_desc = + make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); + constexpr auto out_nkhw_desc = + make_native_tensor_descriptor(OutDesc::GetLengths(),
OutDesc::GetStrides()); +#endif constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -88,7 +97,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 +#elif 0 // BlockSize = 64, each thread hold 64 data constexpr index_t BlockSize = 64; @@ -125,6 +134,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 0 + // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 64; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 2; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 2; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<2, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = 
Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif constexpr index_t N1 = GemmNRepeat; diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index 304fbe211d..974b89e0c9 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp" +//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template