diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index f73557d438..f0ed466eef 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -51,7 +51,7 @@ template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded { -#if 0 +#if 1 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index b14720eb88..cce36456d8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -187,16 +187,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // weight tensor // tensor descriptor in device memory, src of blockwise copy constexpr auto wei_e_k_global_desc = +#if 0 transform_tensor_descriptor(wei_k_c_y_x_global_desc, make_tuple(Merge>{}, PassThrough{}), make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); +#else // hack + make_native_tensor_descriptor_packed(Sequence{}); +#endif - // tensor descriptor in LDS, dst of blockwise copy - // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, - Number{}); + // tensor descriptor in LDS, dst of blockwise copy + // be careful of LDS alignment + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 0330b22438..62dc8b4c9a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -47,7 +47,7 @@ struct NativeTensorCoordinate // mIndex is updated here, but some (or all) of its entries may never be used mIndex += idx_diff; - mOffset += tensor_desc_type::CalculateOffset(idx_diff); + mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff); return *this; } @@ -57,7 +57,7 @@ struct NativeTensorCoordinate // mIndex is updated here, but some (or all) of its entries may never be used mIndex -= idx_diff; - mOffset -= tensor_desc_type::CalculateOffset(idx_diff); + mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff); return *this; } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index a3956dc713..c9a7ec85b2 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -684,12 +684,10 @@ template ; - using SrcCoord = typename TensorCoordinate_v2::type; - using DstCoord = typename TensorCoordinate_v2::type; - - __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin, - DstCoord dst_block_slice_origin) + __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin, + const Index& dst_block_slice_origin) { static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() && diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 110ef53bc5..c7e084c882 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -966,8 +966,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 using SrcCoord = typename TensorCoordinate_v2::type; using DstCoord = typename TensorCoordinate_v2::type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin, - DstCoord dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin, + const Index& dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { static_assert(nDim == SrcDesc::GetNumOfDimension() &&