From 9f46cdf5faebb000ba4c3da33fa8c0bd05fc614d Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 18 Sep 2019 00:15:57 -0500 Subject: [PATCH] experimenting global and buffer load/store --- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 81 +++++-- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_v2.hpp | 4 +- .../blockwise_generic_tensor_slice_copy.hpp | 20 +- .../threadwise_generic_tensor_slice_copy.hpp | 211 +++++++++++++++++- .../include/utility/amd_inline_asm.hpp | 108 +++++++++ .../include/utility/array_helper.hpp | 4 +- .../include/utility/config_amd.hpp.in | 12 +- .../include/utility/config_nvidia.hpp.in | 10 +- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 131 +++++++---- driver/src/driver.cpp | 22 +- 12 files changed, 503 insertions(+), 106 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index d00c92ff9e..25f334a6c9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -225,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index c21ffe500f..2d729ab10f 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -224,14 +224,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k2_n1n2_thread_mtx_desc), + decltype(c_k0k1_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -258,12 +258,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); - - const Float* p_wei_block_on_global = p_wei_global; + threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); // LDS double 
buffer: preload data into LDS { @@ -294,14 +292,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads(); // LDS doubel buffer: load next data from device mem blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, - p_wei_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -319,13 +316,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads(); // LDS doubel buffer: load next data from device mem blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -347,6 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // copy output: register to global memory { +#if 0 constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster; @@ -392,17 +390,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); -#if 0 - ThreadwiseGenericTensorSliceCopy_v1r2< - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()), - arithmetic_sequence_gen<0, 8, 1>::type, - 7, - 1, - 1>(make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#elif 1 ThreadwiseGenericTensorSliceCopy_v2r1< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), @@ -414,6 +401,54 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 1, 1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}) .Run(p_out_thread, p_out_thread_on_global); +#else + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + + // define tensor descriptor for threadwise copy + // output memory layout descriptor in register, src of threadwise copy + constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed( + Sequence{}); + + // output memory layout descriptor in device memory + constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc = + out_n_k_h_w_global_desc.Fold(I1, Number{}).Fold(I0, Number{}, Number{}); + + // output merged global tensor descriptor, dst of threadwise copy + constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = + make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc, + Sequence<3>{}, + Sequence<4>{}, + 
Sequence<1>{}, + Sequence<0, 5, 6>{}, + Sequence<2>{}); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t k_thread_data_on_global = + k_block_data_on_global + c_thread_mtx_on_block.row; + + const index_t b_thread_data_on_global = + b_block_data_on_global + c_thread_mtx_on_block.col / N2; + + ThreadwiseGenericTensorSliceCopy_v2r1< + decltype(out_k0_k1_n1_b_n2_thread_mem_desc), + decltype(out_k0_k1_n1_b_n2_global_merged_desc), + decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), + arithmetic_sequence_gen<0, 5, 1>::type, + arithmetic_sequence_gen<0, 5, 1>::type, + 3, + 3, + 1, + 1>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) + .template Run_amd_experiment(p_out_thread, p_out_global); #endif } } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 4600b682ac..1a2774b589 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -313,14 +313,14 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 831088ab25..62dc8b4c9a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -188,7 +188,7 @@ struct TensorCoordinate_v2 private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -196,7 +196,7 @@ struct TensorCoordinate_v2 template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index c9a7ec85b2..fba11478f8 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -439,11 +439,10 @@ struct BlockwiseGenericTensorSliceCopy_v2 { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - using SrcCoordinate = typename TensorCoordinate::type; - using DstCoordinate = typename TensorCoordinate::type; + using Index = MultiIndex; - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin, - DstCoordinate dst_block_slice_origin) + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, + const Index& 
dst_block_slice_origin) { static_assert( nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && @@ -485,13 +484,21 @@ struct BlockwiseGenericTensorSliceCopy_v2 template __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { +#if 0 mThreadwiseLoad.Run(p_src, p_buffer); +#else + mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer); +#endif } template __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const { +#if 0 mThreadwiseStore.Run(p_buffer, p_dst); +#else + mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); +#endif } template @@ -499,8 +506,13 @@ struct BlockwiseGenericTensorSliceCopy_v2 { TData p_buffer[GetRegisterBufferSize()]; +#if 0 mThreadwiseLoad.Run(p_src, p_buffer); mThreadwiseStore.Run(p_buffer, p_dst); +#else + mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer); + mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); +#endif } template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index c7e084c882..097ac78a8a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -452,11 +452,13 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 { static constexpr index_t nDim = SliceLengths::GetSize(); + using Index = MultiIndex; + using SrcCoordinate = typename TensorCoordinate::type; using DstCoordinate = typename TensorCoordinate::type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin, - DstCoordinate dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin, + const Index& dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { static_assert(nDim == SrcDesc::GetNumOfDimension() && @@ -755,6 +757,211 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 } } + // memory-space + // 0: VGPR + // 1: LDS + // 2: global-memory + template + __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const + { + constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); + + TData p_buffer_[buffer_desc.GetElementSpace()]; + TData* p_buffer = p_buffer_; + + // copy data from src into buffer + { + using src_vector_t = typename vector_type::MemoryType; + + constexpr auto src_vector_access_dim = Number{}; + constexpr auto src_data_per_access = Number{}; + + constexpr auto src_access_lengths = SliceLengths::Modify( + src_vector_access_dim, + SliceLengths::Get(src_vector_access_dim) / src_data_per_access); + + // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t + // normal dimensions is known at compile time. + // Below is a hack to isolate merged dimension id from normal dimension id, so the + // corresponding offset can be calculated seperately at run-time and compile-time. 
+ // src_merged_dim_access_lengths has the same value as src_access_lengths on src's + // merged dimensions, and has value = 1 on normal dimensions; + // src_merged_dim_access_lengths has the same value as src_access_lengths on src's + // normal dimensions, and has value = 1 on merged dimensions; + constexpr auto src_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto src_normal_dim_access_lengths = + src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; + + ford{}([&]( + auto src_merged_dim_access_id) { + + auto src_merged_dim_data_id = src_merged_dim_access_id; + src_merged_dim_data_id(src_vector_access_dim) = + src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; + + // offset w.r.t. merged dimension need be computed at run-time, + const index_t src_merged_offset = + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); + + ford{}([&]( + auto src_normal_dim_access_id) { + + auto src_normal_dim_data_id = src_normal_dim_access_id; + src_normal_dim_data_id(src_vector_access_dim) = + src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; + + // offset w.r.t. normal dimension is known at compile-time + const index_t src_normal_offset = + SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); + + src_vector_t vector_data; + + static_if{}([&](auto) { +#if 1 // source code + // Load vector from src. + // src can be all kinds of memory-space. + // In order for optimized global_load to be emitted by compiler, need to + // assume: + // 1. p_src need to be block-invariant (assumption) + // 2. src_normal_offset must be calculatd at compile time (guaranteed) + // 3. src_merged_offset can be runtime value (no assumption imposed) + vector_data = *reinterpret_cast( + &p_src[src_normal_offset + src_merged_offset]); +#else // inline asm using buffer_load + // Load vector from src + // src's memory-space can only be global-memory (buffer_load inline-asm is + // used) + // In order for buffer_load to be valid, need to assume: + // 1. p_src need to be block-invariant (assumption) + // 2. src_normal_offset must be calculatd at compile time (guaranteed) + // 3. src_merged_offset can be runtime value (no assumption imposed) + vector_data = buffer_load( + p_src, + static_cast(src_merged_offset), + static_cast(src_normal_offset)); +#endif + }).Else([&](auto) { + // Load vector from src. + // src can be all kinds of memory-space. + // In order for optimized global_load to be emitted by compiler, need to + // assume: + // 1. p_src need to be block-invariant (assumption) + // 2. src_normal_offset must be calculatd at compile time (guaranteed) + // 3. 
src_merged_offset can be runtime value (no assumption imposed) + vector_data = *reinterpret_cast( + &p_src[src_normal_offset + src_merged_offset]); + }); + + // unpack vector into buffer + for(index_t i = 0; i < SrcDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(src_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + } + }); + }); + } + + // copy data from buffer into dst + { + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto dst_vector_access_dim = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto dst_access_lengths = SliceLengths::Modify( + dst_vector_access_dim, + SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); + + constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto dst_normal_dim_access_lengths = + dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; + + ford{}( + [&](auto dst_merged_dim_access_id) { + + auto dst_merged_dim_data_id = dst_merged_dim_access_id; + dst_merged_dim_data_id(dst_vector_access_dim) = + dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + // offset w.r.t. merged dimension need be computed at run-time, + const index_t dst_merged_offset = + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); + + ford{}([&]( + auto dst_normal_dim_access_id) { + + auto dst_normal_dim_data_id = dst_normal_dim_access_id; + dst_normal_dim_data_id(dst_vector_access_dim) = + dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + dst_vector_t vector_data; + + // pack vector from buffer + for(index_t i = 0; i < DstDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(dst_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + } + + // offset w.r.t. normal dimension is known at compile-time + const index_t dst_normal_offset = + DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); + + static_if{}([&](auto) { +#if 1 // source code + // Write vector into dst. + // dst can be all kinds of memory-space + // In order for optmized global_store to be emitted by compiler, need to + // assume: + // 1. p_dst need to be block-invariant (assumption) + // 2. dst_normal_offset must be calculatd at compile time (guaranteed) + // 3. dst_merged_offset can be runtime value (no assumption imposed) + *reinterpret_cast( + &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; +#else // inline asm using buffer_store + // Write vector into dst. + // dst's memory-space need to be global-memory (buffer_store is used) + // In order for optmized global_store to be emitted by compiler, need to + // assume: + // 1. p_dst need to be block-invariant (assumption) + // 2. dst_normal_offset must be calculatd at compile time (guaranteed) + // 3. dst_merged_offset can be runtime value (no assumption imposed) + buffer_store( + vector_data, p_dst, dst_merged_offset, dst_normal_offset); +#endif + }).Else([&](auto) { + // Write vector into dst. + // dst can be all kinds of memory-space + // In order for optmized global_store to be emitted by compiler, need to + // assume: + // 1. 
p_dst need to be block-invariant (assumption) + // 2. dst_normal_offset must be calculatd at compile time (guaranteed) + // 3. dst_merged_offset can be runtime value (no assumption imposed) + *reinterpret_cast( + &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; + }); + }); + }); + } + } + // T can be Sequence or Array template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 0a17b4bd3a..307c96c4a4 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -8,6 +8,114 @@ namespace ck { // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); +// buffer_load and buffer_store +template +__device__ typename vector_type::MemoryType +buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset); + +template +__device__ void buffer_store(const typename vector_type::MemoryType& src, + T* p_dst_block, + uint32_t dst_thread_offset, + uint32_t dst_const_offset); + +template <> +__device__ float buffer_load(const float* p_src_block, + uint32_t src_thread_offset, + uint32_t src_const_offset) +{ + float dst; + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + + return dst; +} + +template <> +__device__ vector_type::MemoryType buffer_load(const float* p_src_block, + uint32_t src_thread_offset, + uint32_t src_const_offset) +{ + vector_type::MemoryType dst; + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + + return dst; +} + +template <> +__device__ vector_type::MemoryType buffer_load(const float* p_src_block, + uint32_t src_thread_offset, + uint32_t src_const_offset) +{ + vector_type::MemoryType dst; + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + + return dst; +} + +template <> +__device__ void buffer_store(const float& src, + float* p_dst_block, + uint32_t dst_thread_offset, + uint32_t dst_const_offset) +{ + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm 
volatile("\n \ + buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : + : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset)); +} + __device__ void vmcnt(index_t cnt) { if(cnt == 0) diff --git a/composable_kernel/include/utility/array_helper.hpp b/composable_kernel/include/utility/array_helper.hpp index 7608a195a3..34769af2f8 100644 --- a/composable_kernel/include/utility/array_helper.hpp +++ b/composable_kernel/include/utility/array_helper.hpp @@ -6,7 +6,7 @@ namespace ck { template -__host__ __device__ void print_array(const char* s, Array a) +__host__ __device__ void print_array(const char* s, Array a) { constexpr index_t nsize = a.GetSize(); @@ -90,7 +90,7 @@ __host__ __device__ void print_array(const char* s, Array a) } template -__host__ __device__ void print_array(const char* s, Array a) +__host__ __device__ void print_array(const char* s, Array a) { constexpr index_t nsize = a.GetSize(); diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 9b1542e224..664d78b86b 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -5,6 +5,7 @@ #include "hip/hip_fp16.h" #define CK_DEVICE_BACKEND_AMD 1 +#define CK_USE_UNSIGNED_INDEX_TYPE 1 #define CK_USE_AMD_INLINE_ASM 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 @@ -13,13 +14,10 @@ namespace ck { -using unsigned_t = uint32_t; -using signed_t = int; - -#if 0 // debug -using index_t = unsigned_t; +#if CK_USE_UNSIGNED_INDEX_TYPE +using index_t = uint32_t; #else -using index_t = signed_t; +using index_t = int32_t; #endif // For some reason, HIP compiler need this definition to generate optimal load and store @@ -27,6 +25,8 @@ using index_t = signed_t; typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); + template __device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1) { diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 05be2489bb..b2f8e3e43e 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -7,6 +7,7 @@ #include "helper_cuda.h" #define CK_DEVICE_BACKEND_NVIDIA 1 +#define CK_USE_UNSIGNED_INDEX_TYPE 0 #define CK_USE_AMD_INLINE_ASM 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 @@ -15,13 +16,10 @@ namespace ck { -using unsigned_t = uint32_t; -using signed_t = int; - -#if 0 // debug -using index_t = unsigned_t; +#if CK_USE_UNSIGNED_INDEX_TYPE +using index_t = uint32_t; #else -using index_t = signed_t; +using index_t = int32_t; #endif // For some reason, CUDA need this definition, otherwise diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 51b9e511af..4555ce873f 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -48,7 +48,7 @@ void 
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // each thread hold 64 data + // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -82,10 +82,47 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 - // each thread hold 32 data + // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data + constexpr index_t BlockSize = 64; + + constexpr index_t BPerBlock = 8; + constexpr index_t KPerBlock = 64; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 2; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 1, 8, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 32>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -107,7 +144,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>; using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>; using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] constexpr index_t InBlockCopySrcDataPerRead_B = 1; @@ -119,7 +156,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; + constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif @@ -133,50 +170,50 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + constexpr auto gridwise_conv = +#if 0 + 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw +#else + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer +#endif + {}; + for(index_t i = 0; i < nrepeat; ++i) { - constexpr auto gridwise_conv = -#if 0 - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw -#else - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer -#endif - {}; - float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 94c59c478c..93a871b83b 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -75,13 +75,13 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 8; - constexpr index_t C = 8; - constexpr index_t HI = 2; - constexpr index_t WI = 8; + constexpr index_t N = 64; + constexpr index_t C = 16; + constexpr index_t HI = 34; + constexpr index_t WI = 34; constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; + constexpr index_t Y = 3; + constexpr index_t X = 3; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; @@ -92,8 +92,8 @@ int main(int argc, char* argv[]) // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; - constexpr index_t HI = 32; - constexpr index_t WI = 32; + constexpr index_t HI = 34; + constexpr index_t WI = 34; constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -101,8 +101,8 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<1, 1>; - using RightPads = Sequence<1, 1>; + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% @@ -434,7 +434,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,
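
A note on the new Run_amd_experiment path: its memory-space template arguments follow the enumeration given in the comment block above it (0 = VGPR, 1 = LDS, 2 = global memory). The full template parameter lists are not reproduced in the hunks above, so the enum and the instantiation pattern below are only an assumed spelling for illustration, not the exact CK signature.

// Illustrative only -- neither the enum nor the instantiation pattern below is a CK
// identifier; the values follow the comment block above Run_amd_experiment.
enum MemorySpace
{
    Vgpr   = 0, // thread-private registers (VGPR)
    Lds    = 1, // LDS / shared memory
    Global = 2  // global (device) memory
};

// Assumed instantiation pattern, e.g. inside BlockwiseGenericTensorSliceCopy_v2:
//   load  (global -> register buffer): mThreadwiseLoad .template Run_amd_experiment<TData, Global, Vgpr>(p_src, p_buffer);
//   store (register buffer -> LDS)   : mThreadwiseStore.template Run_amd_experiment<TData, Vgpr, Lds>(p_buffer, p_dst);
// and, for the final output copy in the gridwise kernel:
//   store (register -> global)       : .template Run_amd_experiment<TData, Vgpr, Global>(p_out_thread, p_out_global);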
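
Run_amd_experiment decomposes every address into an offset contributed by the normal dimensions, which is known at compile time, and an offset contributed by the merged dimensions, which is computed per thread at run time. A minimal sketch of the resulting addressing pattern follows (not CK code; the function, the include name, and the example constant are illustrative):

#include <hip/hip_runtime.h>
#include "config.hpp" // assumed include for ck::index_t and ck::float4_t

// The three assumptions stated in the comments above: the base pointer is
// block-invariant, the "normal dimension" offset is a compile-time constant, and the
// "merged dimension" offset may be a run-time value. Under these assumptions the
// compiler can fold the constant into the addressing and emit a single
// global_load_dwordx4 / global_store_dwordx4 per vector.
__device__ void copy_one_vector(const float* p_src,       // block-invariant global pointer
                                float* p_dst,              // block-invariant global pointer
                                ck::index_t merged_offset) // run-time part of the offset
{
    constexpr ck::index_t normal_offset = 16; // compile-time part (example value)

    const ck::float4_t v =
        *reinterpret_cast<const ck::float4_t*>(&p_src[normal_offset + merged_offset]);

    *reinterpret_cast<ck::float4_t*>(&p_dst[normal_offset + merged_offset]) = v;
}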
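
The int32x4_t "block setting" built inside the buffer_load / buffer_store helpers is an AMD buffer resource descriptor: its first two dwords receive the base address, the third dword the number of records (-1, i.e. effectively no bound), and the fourth dword the 0x00027000 configuration word. The inline asm passes the thread offset as the VGPR voffset operand ("offen") and the constant offset as the SGPR soffset operand, both in bytes. A hedged usage sketch is below; the elided primary-template parameters are assumed to be <T, DataPerAccess>, and the kernel itself is illustrative.

#include <hip/hip_runtime.h>
#include <cstdint>
#include "amd_inline_asm.hpp" // buffer_load / buffer_store (AMD backend only)

// Copies one float per thread through buffer_load_dword / buffer_store_dword.
// Assumptions: buffer_load<float, 1> / buffer_store<float, 1> match the elided
// template parameter lists, and p_src / p_dst are block-invariant global pointers.
__global__ void buffer_copy_kernel(const float* p_src, float* p_dst)
{
    using namespace ck;

    // Per-thread, run-time byte offset -> voffset operand ("offen").
    const uint32_t thread_offset =
        static_cast<uint32_t>(sizeof(float)) * (blockIdx.x * blockDim.x + threadIdx.x);

    // Wave-uniform byte offset -> SGPR soffset operand; in the threadwise copy this is
    // where the compile-time "normal dimension" offset would go.
    const uint32_t const_offset = 0;

    const float v = buffer_load<float, 1>(p_src, thread_offset, const_offset);
    buffer_store<float, 1>(v, p_dst, thread_offset, const_offset);
}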