diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index 6303c09f95..630f852e6d 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -173,8 +173,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                                decltype(in_e_n1_b_n2_global_merged_desc),
                                                decltype(in_e_n1_b_n2_block_desc),
-                                               Sequence<...>,
-                                               Sequence<1, 1, 1, 1>,
                                                decltype(in_e_n1_b_n2_block_desc.GetLengths()),
                                                InBlockCopySubLengths_E_N1_B_N2,
                                                InBlockCopyClusterLengths_E_N1_B_N2,
@@ -216,8 +214,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                                decltype(wei_e_k_global_desc),
                                                decltype(wei_e_k_block_desc),
-                                               Sequence<...>,
-                                               Sequence<1, 1>,
                                                decltype(wei_e_k_block_desc.GetLengths()),
                                                WeiBlockCopySubLengths_E_K,
                                                WeiBlockCopyClusterLengths_E_K,
@@ -427,8 +423,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_k0_k1_n1_b_n2_thread_desc),
                                                   decltype(out_k0_k1_n1_b_n2_global_merged_desc),
-                                                  Sequence<...>,
-                                                  Sequence<1, 1, 1, 0, 1>,
                                                   decltype(
                                                       out_k0_k1_n1_b_n2_thread_desc.GetLengths()),
                                                   arithmetic_sequence_gen<0, 5, 1>::type,
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index 0305a87924..ffd74d2f2f 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -145,8 +145,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                                decltype(in_e_b_global_desc),
                                                decltype(in_e_b_block_desc),
-                                               Sequence<...>,
-                                               Sequence<1, 1>,
                                                decltype(in_e_b_block_desc.GetLengths()),
                                                InBlockCopySubLengths_E_B,
                                                InBlockCopyClusterLengths_E_B,
@@ -186,8 +184,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                                decltype(wei_e_k_global_desc),
                                                decltype(wei_e_k_block_desc),
-                                               Sequence<...>,
-                                               Sequence<1, 1>,
                                                decltype(wei_e_k_block_desc.GetLengths()),
                                                WeiBlockCopySubLengths_E_K,
                                                WeiBlockCopyClusterLengths_E_K,
@@ -390,8 +386,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             ThreadwiseGenericTensorSliceCopy_v4r2<
                 decltype(out_k0_k1_b0_b1_thread_desc),
                 decltype(out_k0_k1_b0_b1_global_desc),
-                Sequence<1, 1, 1, 1>,
-                Sequence<1, 1, 0, 0>,
                 decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()),
                 arithmetic_sequence_gen<0, 4, 1>::type,
                 3,
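Across both kernel files the change is the same: every BlockwiseGenericTensorSliceCopy_v4 / ThreadwiseGenericTensorSliceCopy_v4r2 instantiation drops two Sequence<...> arguments, the hand-written linear-dimension masks. For example, Sequence<1, 1, 0, 0> above recorded that in out_k0_k1_b0_b1_global_desc the k0/k1 dimensions are linear while the two b dimensions are not. The later hunks make the copy classes obtain these masks from the descriptor types themselves. A minimal standalone sketch of that shift; FakeOutGlobalDesc and CopySketch are stand-in names, not ck types, and a std::array stands in for ck's Sequence (C++17):

    #include <array>
    #include <cstdio>

    // stand-in for a ck descriptor type exposing the new query
    struct FakeOutGlobalDesc
    {
        static constexpr std::array<bool, 4> GetLinearDimensionMask()
        {
            return {true, true, false, false}; // k0, k1 linear; b0, b1 merged
        }
    };

    // before this diff: template <class Desc, class LinearDimMask> struct Copy {...};
    // after it, the copy derives the mask from the descriptor type itself
    template <class Desc>
    struct CopySketch
    {
        static constexpr auto linear_dim_mask = Desc::GetLinearDimensionMask();
    };

    int main()
    {
        for(bool is_linear : CopySketch<FakeOutGlobalDesc>::linear_dim_mask)
            std::printf("%d ", is_linear ? 1 : 0); // prints: 1 1 0 0
        std::printf("\n");
        return 0;
    }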
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 223d0d5bed..5b4805e9ee 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template <...>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<...>)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor<...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<...>>();
     }

     template <...>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<...>)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<...>>();
     }
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
index 8004d57c2a..d6cc0cd46d 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -192,7 +192,7 @@ struct TensorCoordinate_v2
     private:
     template <...>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(NativeTensorDescriptor<...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<...>>(
             make_zero_array<...>());
@@ -200,7 +200,7 @@ struct TensorCoordinate_v2

     template <...>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(TransformedTensorDescriptor<...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<...>)
    {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<...>>(
             make_zero_array<...>());
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index a934651911..175c44ae7c 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -353,7 +353,6 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 1
     struct lambda_sequence_logical_and
     {
         template <...>
@@ -378,7 +377,7 @@ struct TransformedTensorDescriptor
         // check only one transform at a time
         template <typename Transform, typename LowDimensionId, typename UpDimensionId>
         __host__ __device__ constexpr auto
-        operator()(const Transform& tran, LowDimensionId, UpDimensionId) const
+        operator()(Transform, LowDimensionId, UpDimensionId) const
         {
             // judge if transformation is linear
             constexpr bool is_linear_transform = Transform::IsLinearTransform();
@@ -392,23 +391,42 @@ struct TransformedTensorDescriptor
             // create linear mask for upper dimensions
             constexpr bool are_up_dim_linear = is_linear_transform && are_all_low_dim_linear;

-            constexpr auto mask_of_up_linear_dims = modifiy_sequence_by_ids(
-                typename uniform_sequence_gen<...>::type{},
-                typename uniform_sequence_gen<...>::type{},
+            constexpr auto mask_of_up_linear_dims = modify_sequence_elements_by_ids(
+                typename uniform_sequence_gen<...>::type{},
+                typename uniform_sequence_gen<...>::type{},
                 UpDimensionId{});

             return mask_of_up_linear_dims;
         }
     };

+    // TODO: this is a hack, transform_tuples() doesn't compile, would complain about constexpr
+    template <typename F, typename X, typename Y, typename Z, index_t... Is>
+    __host__ __device__ static constexpr auto
+    dummy_transform_tuples_impl(F f, X x, Y y, Z z, Sequence<Is...>)
+    {
+        return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
+    }
+
     __host__ __device__ static constexpr auto GetLinearDimensionMask()
     {
+#if 0
         // create tuple of linear dimension masks, for all transformations
         constexpr auto tuple_of_linear_dimension_mask =
             transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
                              Transforms{},
                              LowDimensionIds{},
                              UpDimensionIds{});
+#else
+        // create tuple of linear dimension masks, for all transformations
+        // TODO: this is a hack, transform_tuples() doesn't compile, complain about constexpr
+        constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl(
+            lambda_get_linear_dimension_mask_of_single_tranform{},
+            Transforms{},
+            LowDimensionIds{},
+            UpDimensionIds{},
+            typename arithmetic_sequence_gen<0, Transforms::Size(), 1>::type{});
+#endif

         // reduce tuple of masks into one mask
         constexpr auto linear_dimension_mask =
@@ -444,6 +462,7 @@ struct TransformedTensorDescriptor
             typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
     }

+#if 0
     __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
     {
         // not implemented
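GetLinearDimensionMask() asks every transform for a mask over all upper dimensions (1 = linear, 0 = non-linear) via lambda_get_linear_dimension_mask_of_single_tranform, then folds the tuple of per-transform masks with lambda_sequence_logical_and. For that fold to work, each mask presumably keeps the AND-neutral value 1 at positions the transform does not own; the exact uniform_sequence_gen arguments are not shown above, so this is inferred. A standalone sketch of the reduction, assuming a Merge-like transform is the non-linear case and using std::array in place of ck's Sequence (C++17):

    #include <array>
    #include <cstdio>

    // AND-reduce two masks, mirroring lambda_sequence_logical_and
    template <std::size_t N>
    constexpr std::array<bool, N> sequence_logical_and(const std::array<bool, N>& a,
                                                       const std::array<bool, N>& b)
    {
        std::array<bool, N> r{};
        for(std::size_t i = 0; i < N; ++i)
            r[i] = a[i] && b[i];
        return r;
    }

    int main()
    {
        // 4 upper dimensions; transform A owns dims {0, 1} and is linear
        // (e.g. PassThrough), transform B owns dims {2, 3} and is non-linear
        // (e.g. Merge); each stays neutral (1) outside its own dims
        constexpr std::array<bool, 4> mask_a{true, true, true, true};
        constexpr std::array<bool, 4> mask_b{true, true, false, false};

        constexpr auto linear_mask = sequence_logical_and(mask_a, mask_b);

        for(bool is_linear : linear_mask)
            std::printf("%d ", is_linear ? 1 : 0); // prints: 1 1 0 0
        std::printf("\n");
        return 0;
    }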
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 7076e26d5b..d9c8060d04 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -680,8 +680,6 @@ struct BlockwiseGenericTensorSliceCopy_v3
 template <index_t BlockSize,
           class SrcDesc,
           class DstDesc,
-          class SrcLinearDimensionMask,
-          class DstLinearDimensionMask,
           class SliceLengths,
           class SubLengths,
           class ThreadClusterLengths,
@@ ... @@ struct BlockwiseGenericTensorSliceCopy_v4
-    using ThreadwiseLoad =
-        ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
-                                              ThreadBufferDesc,
-                                              SrcLinearDimensionMask,
-                                              typename uniform_sequence_gen<nDim, 1>::type,
-                                              SubLengths,
-                                              SrcDimAccessOrder,
-                                              SrcVectorAccessDim,
-                                              SrcDataPerAccess,
-                                              1>;
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
+                                                                 ThreadBufferDesc,
+                                                                 SubLengths,
+                                                                 SrcDimAccessOrder,
+                                                                 SrcVectorAccessDim,
+                                                                 SrcDataPerAccess,
+                                                                 1>;

-    using ThreadwiseStore =
-        ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
-                                              DstDesc,
-                                              typename uniform_sequence_gen<nDim, 1>::type,
-                                              DstLinearDimensionMask,
-                                              SubLengths,
-                                              DstDimAccessOrder,
-                                              DstVectorAccessDim,
-                                              1,
-                                              DstDataPerAccess>;
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
+                                                                  DstDesc,
+                                                                  SubLengths,
+                                                                  DstDimAccessOrder,
+                                                                  DstVectorAccessDim,
+                                                                  1,
+                                                                  DstDataPerAccess>;

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index ef43b3d380..4094b1f094 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -1130,8 +1130,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
 // the other is device memory or LDS
 template <class SrcDesc,
           class DstDesc,
-          class SrcLinearDimensionMask,
-          class DstLinearDimensionMask,
           class SliceLengths,
           class DimAccessOrder,
           index_t VectorAccessDim,
@@ ... @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
-        // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask()
-        // is implemented
-        constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{};
-        constexpr auto src_nonlinear_dim_mask =
-            SrcLinearDimensionMask::Transform(logical_not<bool>{});
+        // separate linear dimensions from non-linear dimensions
+        constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
+        constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

         static_assert(
             src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
@@ -1459,11 +1455,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto long_vector_access_lengths = SliceLengths::Modify(
             vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

-        // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask()
-        // is implemented
-        constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{};
-        constexpr auto dst_nonlinear_dim_mask =
-            DstLinearDimensionMask::Transform(logical_not<bool>{});
+        // separate linear dimensions from non-linear dimensions
+        constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
+        constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

         static_assert(
             dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
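Both static_asserts encode the same vectorization rule: long_vector_size may exceed SrcDataPerAccess / DstDataPerAccess only when the vector-access dimension is linear, because only there does stepping the index move the offset by a fixed stride, so adjacent narrow accesses can be fused into one long vector. On a non-linear (merged) dimension every long vector must be exactly one access wide. A standalone restatement of the condition; this is my reading of the assert, not ck code (C++17):

    #include <type_traits>

    // if the vector-access dimension is linear, several narrow accesses can be
    // fused into one long vector; otherwise the long vector must be exactly one
    // access wide
    template <bool DimIsLinear, int LongVectorSize, int DataPerAccess>
    struct CanVectorize
        : std::bool_constant<DimIsLinear || LongVectorSize == DataPerAccess>
    {
    };

    static_assert(CanVectorize<true, 4, 2>::value, "linear dim: fuse two 2-wide loads into one 4-wide");
    static_assert(CanVectorize<false, 2, 2>::value, "non-linear dim: one access per vector is fine");
    static_assert(!CanVectorize<false, 4, 2>::value, "non-linear dim: fusing assumes contiguity that may not exist");

    int main() { return 0; }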
diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp
index 815ca8466e..665db3ff31 100644
--- a/composable_kernel/include/utility/tuple.hpp
+++ b/composable_kernel/include/utility/tuple.hpp
@@ -125,6 +125,13 @@ transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
     return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
 }

+template <typename F, typename X, typename Y, typename Z, index_t... Is>
+__host__ __device__ constexpr auto
+transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
+{
+    return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
+}
+
 } // namespace detail

 template <typename F, typename X, typename Y>
@@ -141,5 +148,12 @@ __host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
         f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
 }

+template <typename F, typename X, typename Y, typename Z>
+__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
+{
+    return detail::transform_tuples_impl(
+        f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
+}
+
 } // namespace ck
 #endif
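The new overload simply extends the existing binary transform_tuples to three tuples, zipping them element-wise through f. A standalone std::tuple analog of the same pattern; illustration only, since ck's version uses its own Tuple, Sequence and Number types plus the __host__ __device__ qualifiers (C++17):

    #include <cstdio>
    #include <tuple>
    #include <utility>

    // expand the index pack and apply f to the i-th element of each tuple
    template <typename F, typename X, typename Y, typename Z, std::size_t... Is>
    constexpr auto transform_tuples_impl(F f, const X& x, const Y& y, const Z& z,
                                         std::index_sequence<Is...>)
    {
        return std::make_tuple(f(std::get<Is>(x), std::get<Is>(y), std::get<Is>(z))...);
    }

    template <typename F, typename X, typename Y, typename Z>
    constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
    {
        return transform_tuples_impl(
            f, x, y, z, std::make_index_sequence<std::tuple_size<X>::value>{});
    }

    int main()
    {
        constexpr auto r = transform_tuples([](int a, int b, int c) { return a + b + c; },
                                            std::make_tuple(1, 2),
                                            std::make_tuple(10, 20),
                                            std::make_tuple(100, 200));
        std::printf("%d %d\n", std::get<0>(r), std::get<1>(r)); // prints: 111 222
        return 0;
    }

The in-class dummy_transform_tuples_impl in tensor_descriptor.hpp duplicates this implementation because, per the TODO comments in the diff, calling the namespace-scope transform_tuples from GetLinearDimensionMask() tripped a constexpr diagnostic on the compiler in use.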