diff --git a/composable_kernel/include/tensor_description/dimension.hpp b/composable_kernel/include/tensor_description/dimension.hpp index ebe2038ee1..566895b9a4 100644 --- a/composable_kernel/include/tensor_description/dimension.hpp +++ b/composable_kernel/include/tensor_description/dimension.hpp @@ -11,13 +11,6 @@ struct NativeDimension __host__ __device__ static constexpr auto GetLength() { return Number{}; } __host__ __device__ static constexpr auto GetStride() { return Number{}; } - - __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; } - - __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff) - { - return i_diff * Stride; - } }; } // namespace ck diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 0c6372094e..5114b2ce99 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -193,7 +193,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -201,7 +201,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index db48b1a906..46e551ddd4 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index b5df66aa44..072091c872 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -75,10 +75,9 @@ struct BlockwiseGenericTensorSliceCopy_v4 { #if 0 mThreadwiseLoad.template Run(p_block_src, - p_thread_buffer); + ThreadBufferData, + BlockSrcAddressSpace, + ThreadBufferAddressSpace>(p_block_src, p_thread_buffer); #else // tweaking mThreadwiseLoad.template Run_optimized_src_address_calculation __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const { - mThreadwiseLoad.Run(p_block_src, - p_thread_buffer); + mThreadwiseLoad.template Run( + p_block_src, p_thread_buffer); } template __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const { - mThreadwiseStore.Run(p_thread_buffer, - p_block_dst); + mThreadwiseStore.template Run( + p_thread_buffer, p_block_dst); } template (&p_src_long_vector[buffer_offset]) = __buffer_load( - p_src, 0, src_coord.GetOffset()); + p_src, src_coord.GetOffset(), 0); #else *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, - 0, - dst_coord.GetOffset()); + dst_coord.GetOffset(), + 0); #else *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = *reinterpret_cast(&p_dst_long_vector[buffer_offset]); @@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto src_coord = src_nonlinear_coord + (linear_dim_data_steps + scalar_id); - // this is src compile-time offset +// this is src compile-time offset +#if 0 // TODO: is this good implementation? const index_t src_linear_offset = src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); +#else + const index_t src_linear_offset = + SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id); +#endif // Check src vector's padding situation, only check the first data in // this src vector. It's user's responsiblity to make sure all data in @@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); - // this is dst compile-time offset +// this is dst compile-time offset +#if 1 // TODO: is this good implementation? const index_t dst_linear_offset = dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); +#else + const index_t dst_linear_offset = + DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id); +#endif // Check dst vector's padding situation, only check the first data in // this dst vector. It's user's responsiblity to make sure all data in diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 437ed3ee8f..6057cd8b5c 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -6,7 +6,7 @@ #define CK_UNSIGNED_INDEX_TYPE 0 #define CK_DEVICE_BACKEND_AMD 1 -#define CK_USE_AMD_INTRINSIC 1 +#define CK_USE_AMD_INTRINSIC 0 #define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 42106a5222..9fe0fb5dbc 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -448,7 +448,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc,