diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp index 5b55ed46a5..85b2d7894c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -332,7 +332,7 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard); -#if 1 +#if 0 if(get_block_1d_id() == 0) { printf("tid (%d %d), %f %f %f %f\n", diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..40a44ee5df --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,344 @@ +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantMatrixDescriptor.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "blockwise_gemm.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" + +namespace ck { + +// B = merge(N, H, W) +template +struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw +{ + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1]; + + constexpr index_t K = out_n_k_h_w_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3]; + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t E = C * Y * X; + constexpr index_t B = N * Ho * Wo; + + static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), + "wrong! aligment requirement for vectorized global load of input tensor will " + "be violated"); + + // divide block work by [K, B] + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, + "wrong! 
cannot divide work evenly among block"); + + constexpr index_t KBlockWork = K / KPerBlock; + constexpr index_t BBlockWork = B / BPerBlock; + + constexpr auto block_work_desc = + make_ConstantTensorDescriptor_packed(Sequence{}); + + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + + // input tensor + // tensor descriptor in device memory [N, Ho, Wo] + constexpr auto in_n_ho_wo_global_desc = + in_n_c_h_w_global_desc.Extract(I0, I2, I3) + .StridedSlice(I1, Number{}, Number{}) + .StridedSlice(I2, Number{}, Number{}); + + // batch descritpor for device memory + constexpr auto in_c_y_x_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Extract(Sequence<1, 2, 3>{}); + + // merged tensor descriptor in device memory [E, B], src of blockwise copy + constexpr auto in_e_b_global_desc = + make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc), + Sequence<0, 1, 2>{}, + Sequence<3, 4, 5>{}); + + // memory layout descriptor in LDS [E, B], dst of blockwise copy + // be careful of LDS alignment + constexpr auto in_e_b_block_desc = + make_ConstantTensorDescriptor_packed(Sequence{}); + + // input blockwise copy + // slice a merged tensor, reorder and copy to a normal tensor + // this copy operator already has blockwise offset built-in + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v2, + NormalTensorCoordinate, + decltype(in_e_b_block_desc.GetLengths()), + InBlockCopySubLengths_E_B, + InBlockCopyClusterLengths_E_B, + InBlockCopyThreadClusterArrangeOrder>( + {0, b_block_data_on_global}, {0, 0}); + + // weight tensor + // tensor descriptor in device memory, src of blockwise copy + constexpr auto wei_e_k_global_desc = + wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + + // tensor descriptor in LDS, dst of blockwise copy + // be careful of LDS alignment + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); + + // operator for blockwise copy of weight into LDS + // slice a tensor, and copy it into another tensor + // this copy operator already have blockwise offset built-in + auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2< + BlockSize, + Float, + decltype(wei_e_k_global_desc), + decltype(wei_e_k_block_desc), + MergedTensorCoordinate, + NormalTensorCoordinate, + decltype(wei_e_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[EPerBlock, KPerBlock] is in LDS + // b_mtx[EPerBlocl, BPerBlock] is in LDS + // c_mtx[KPerBlock, BPerBlock] is distributed among threads, and saved in + // register + constexpr auto a_e_k_block_mtx_desc = + make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(wei_e_k_block_desc); + + constexpr auto b_e_b_block_mtx_desc = + make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(in_e_b_block_desc); + + // sanity check + static_assert( + KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 && + BPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, + "wrong!"); + + constexpr index_t GemmMRepeat = + KPerBlock / (GemmMPerThreadSubC * 
GemmMLevel0Cluster * GemmMLevel1Cluster); + + constexpr index_t GemmNRepeat = + BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster); + + // c_thread_mtx definition: this is a mess + // TODO:: more elegent way of defining c_thread_mtx + constexpr auto c_k0k1_b0b1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); + + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< + BlockSize, + decltype(a_e_k_block_mtx_desc), + decltype(b_e_b_block_mtx_desc), + decltype(c_k0k1_b0b1_thread_mtx_desc), + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB>{}; + + // LDS allocation for input and weight: be careful of alignment + constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_B, + WeiBlockCopyDstDataPerWrite_K, + GemmDataPerReadA, + GemmDataPerReadB); + + constexpr index_t in_block_space = + math::integer_least_multiple(in_e_b_block_desc.GetElementSpace(), max_align); + + constexpr index_t wei_block_space = + math::integer_least_multiple(wei_e_k_block_desc.GetElementSpace(), max_align); + + __shared__ Float p_in_block[in_block_space]; + __shared__ Float p_wei_block[wei_block_space]; + + // register allocation for output + Float p_out_thread[c_k0k1_b0b1_thread_mtx_desc.GetElementSpace()]; + + // zero out threadwise output + threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); + + const Float* p_wei_block_on_global = p_wei_global; + + for(index_t e_block_data_begin = 0; e_block_data_begin < E; e_block_data_begin += EPerBlock) + { + blockwise_in_copy.Run(p_in_global, p_in_block); + blockwise_wei_copy.Run(p_wei_global, p_wei_block); + + __syncthreads(); + + blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread); + + __syncthreads(); + + blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true); + blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true); + } + + // copy output: register to global memory + { + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; + + // define tensor descriptor for threadwise copy + // output global descriptor, for calculating origin of thread tensor + // in global memory + constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor( + out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{}); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t k_thread_data_on_global = + k_block_data_on_global + c_thread_mtx_on_block.row; + + const index_t b_thread_data_on_global = + b_block_data_on_global + c_thread_mtx_on_block.col; + +#if 0 + // origin of dst in device memory + Float* p_out_thread_on_global = p_out_global + + out_k_b_global_desc.GetOffsetFromMultiIndex( + k_thread_data_on_global, b_thread_data_on_global); + + // dst descriptor + constexpr auto out_k0_k1_b0_b1_global_desc = + out_k_b_global_desc.Fold(I1, Number{}).Fold(I0, Number{}); + + // src descriptor + constexpr auto out_k0_k1_b0_b1_thread_desc = make_ConstantTensorDescriptor_packed( + Sequence{}); + + const auto threadwise_out_copy = + ThreadwiseGenericTensorSliceCopy_v2::type, + 1, + 1>({0, 0, 0, 0}, + {k_thread_data_on_global / K1, + 
k_thread_data_on_global % K1, + b_thread_data_on_global / B1, + b_thread_data_on_global % B1}); + + threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global); +#else + // This is a hack, because slicing a merged dimension is not supported yet. + // This should be replaced with logic above, once slicing a merged dimension support + // become available + // dst descriptor + constexpr auto out_k0_k1_b_global_desc = + make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number{}), + Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3, 4>{}); + + // src descriptor + constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( + Sequence{}); + + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2< + Float, + decltype(out_k0_k1_b_thread_desc), + decltype(out_k0_k1_b_global_desc), + NormalTensorCoordinate, + MergedTensorCoordinate, + Sequence>( + {0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + b_thread_data_on_global}); + + for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) + { + threadwise_out_copy.Run(p_out_thread, p_out_global); + + threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true); + threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true); + } +#endif + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index 9833ef3200..e353b060c6 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -49,17 +49,9 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto GetStrides() { return Strides{}; } - template - __host__ __device__ static constexpr auto GetLength(IDim) - { - return Lengths::Get(IDim{}); - } + __host__ __device__ static constexpr auto GetLength(index_t IDim) { return Lengths{}[IDim]; } - template - __host__ __device__ static constexpr auto GetStride(IDim) - { - return Strides::Get(IDim{}); - } + __host__ __device__ static constexpr auto GetStride(index_t IDim) { return Strides{}[IDim]; } struct lambda_AreDimensionsContinuous { diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp new file mode 100644 index 0000000000..4ab3894636 --- /dev/null +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -0,0 +1,329 @@ +#ifndef CK_TENSOR_COORDINATE_HPP +#define CK_TENSOR_COORDINATE_HPP + +#include "common_header.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" + +namespace ck { + +template +struct NormalTensorCoordinate +{ + using type = NormalTensorCoordinate; + using tensor_desc_type = TensorDesc; + + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + + __host__ __device__ constexpr NormalTensorCoordinate(Array tensor_index) + : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)} + { + } + + template + __host__ __device__ constexpr NormalTensorCoordinate(Xs... 
xs) + : NormalTensorCoordinate(Array{xs...}) + { + } + + __host__ __device__ constexpr Array GetIndex() const { return mIndex; } + + __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + + template + __host__ __device__ void + MoveOnDimension(IDim idim, index_t step_size, integral_constant) + { + if(PositiveDirection) + { + mIndex(idim) += step_size; + mOffset += step_size * tensor_desc_type::GetStride(idim); + } + else + { + mIndex(idim) -= step_size; + mOffset -= step_size * tensor_desc_type::GetStride(idim); + } + } + + // T is Array or Sequence + template + __host__ __device__ type operator+=(T step_sizes) + { +#if 0 + static_assert(is_same, "wrong!"); +#endif + static_assert(T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + }); + + return *this; + } + + template + __host__ __device__ type operator-=(T step_sizes) + { +#if 0 + static_assert(is_same, "wrong!"); +#endif + static_assert(T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + }); + + return *this; + } + + template + __host__ __device__ constexpr type operator+(T step_sizes) const + { + type coord = *this; + coord += step_sizes; + return coord; + } + + template + __host__ __device__ constexpr type operator-(T step_sizes) const + { + type coord = *this; + coord -= step_sizes; + return coord; + } + + // private: + Array mIndex; + index_t mOffset; +}; + +template +struct MergedTensorCoordinate +{ + using type = MergedTensorCoordinate; + using tensor_desc_type = TensorDesc; + + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + static constexpr index_t nOriginalDim = + tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); + + __host__ __device__ constexpr MergedTensorCoordinate(Array tensor_index) + : mIndex{tensor_index}, + mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} + { + // partial offset on each dimension + static_for<0, nDim, 1>{}([&](auto idim) { + constexpr auto partial_original_dims = + tensor_desc_type::GetContainedOriginalDimensions(idim); + + constexpr auto partial_original_desc = + tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); + + mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex( + extract_array(mOriginalIndex, partial_original_dims)); + }); + + // complete offset + mOffset = + accumulate_on_array(mPartialOffsets, math::plus{}, static_cast(0)); + } + + template + __host__ __device__ constexpr MergedTensorCoordinate(Xs... 
xs) + : MergedTensorCoordinate(Array{xs...}) + { + } + + __host__ __device__ constexpr Array GetIndex() const { return mIndex; } + + __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + + // step_size should be known at compile time + template + __host__ __device__ void + MoveOnDimension(IDim, index_t step_size, integral_constant) + { + constexpr auto idim = IDim{}; + + // update multi-index + if(PositiveDirection) + { + mIndex(idim) += step_size; + } + else + { + mIndex(idim) -= step_size; + } + + // update rest + static_if{}([&](auto) { + constexpr auto partial_original_dims = + tensor_desc_type::GetContainedOriginalDimensions(idim); + + constexpr index_t ndim_partial_original = partial_original_dims.GetSize(); + + constexpr auto partial_original_desc = + tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); + + const auto partial_original_step_sizes = + partial_original_desc.GetMultiIndexFrom1dIndex(step_size); + + // update partial original multi-id + auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims); + + static_if{}([&](auto) { + partial_original_id += partial_original_step_sizes; + + bool carry = false; + + // do carry check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) { + constexpr index_t i = ndim_partial_original - 1 - IReverse; + + if(carry) + { + ++partial_original_id(i); + } + + carry = false; + + if(partial_original_id[i] >= partial_original_desc.GetLength(i)) + { + partial_original_id(i) -= partial_original_desc.GetLength(i); + carry = true; + } + }); + }).Else([&](auto) { + // shift up multi-id to avoid unsigned integer underflow during intermediate + // calculations. After the shift, should have new_multi_id[...] >= 1 + partial_original_id += + partial_original_desc.GetLengths() - partial_original_step_sizes; + + bool borrow = false; + + // do borrow check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) { + constexpr index_t i = ndim_partial_original - 1 - IReverse; + + if(borrow) + { + --partial_original_id(i); + } + + borrow = false; + + if(partial_original_id[i] < partial_original_desc.GetLength(i)) + { + partial_original_id(i) += partial_original_desc.GetLength(i); + borrow = true; + } + }); + + // shift back down multi-id + // here, should have new_multi_id[...] 
>= GetLengths() + partial_original_id = partial_original_id - partial_original_desc.GetLengths(); + }); + + // update "mOriginalIndex" + static_for<0, ndim_partial_original, 1>{}([&](auto I) { + constexpr auto idim_original = partial_original_dims[I]; + + mOriginalIndex(idim_original) = partial_original_id[I]; + }); + + // calculate new partial offset on this merged dimension + const index_t old_partial_offset = mPartialOffsets[idim]; + + mPartialOffsets(idim) = + partial_original_desc.GetOffsetFromMultiIndex(partial_original_id); + + // update "mThreadSrcOffset", do "+" before "-" to avoid underflow + mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset; + }).Else([&](auto) { + constexpr auto idim_original = + tensor_desc_type::GetContainedOriginalDimensions(idim).Front(); + + static_if{}([&](auto fwd) { + mOriginalIndex(idim_original) += step_size; + mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim); + mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim); + }).Else([&](auto fwd) { + mOriginalIndex(idim_original) -= step_size; + mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim); + mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); + }); + }); + } + + // T is Array or Sequence + template + __host__ __device__ type operator+=(T step_sizes) + { +#if 0 + static_assert(is_same, "wrong!"); +#endif + static_assert(T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + }); + + return *this; + } + + template + __host__ __device__ type operator-=(T step_sizes) + { +#if 0 + static_assert(is_same, "wrong!"); +#endif + static_assert(T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + }); + + return *this; + } + + template + __host__ __device__ constexpr type operator+(T step_sizes) const + { + type coord = *this; + coord += step_sizes; + return coord; + } + + template + __host__ __device__ constexpr type operator-(T step_sizes) const + { + type coord = *this; + coord -= step_sizes; + return coord; + } + + // private: + Array mIndex; + Array mOriginalIndex; + Array mPartialOffsets; // mPartialOffsets is needed for for unsigned index type + index_t mOffset; +}; + +#if 0 +// implementation of MergedTensorCoordinate, when index_t is signed integer +// mPartialOffsets is not needed, if index_t is signed integer type +template<> +struct TensorCoordinate +{ + private: + Array<_t, nDim> mIndex; + Array<_t, nOriginalDim> mOriginalIndex; + index_t mOffset; +}; +#endif + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 981e0ad8db..a6b4fb25e0 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" +#include "tensor_coordinate.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 @@ -373,6 +374,64 @@ struct BlockwiseGenericTensorSliceCopy_v1 } }; +template +struct BlockwiseGenericTensorSliceCopy_v2 +{ + using ThreadwiseCopy = 
ThreadwiseGenericTensorSliceCopy_v2; + + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin, + DstCoordinate dst_block_slice_origin) + { + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + const auto thread_cluster_multi_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_multi_id = + reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{}; + + mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin); + mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin); + } + + __device__ void Run(const TData* p_src, TData* p_dst) const + { + mThreadwiseCopy.Run(p_src, p_dst); + } + + __device__ void MoveSrcSlicingWindow(Array step_sizes, bool positive_direction) + { + mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction); + } + + __device__ void MoveDstSlicingWindow(Array step_sizes, bool positive_direction) + { + mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction); + } + + // private: + ThreadwiseCopy mThreadwiseCopy; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 5cff460050..d005cb84a2 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" +#include "tensor_coordinate.hpp" #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 @@ -105,5 +106,75 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( #endif } +template +struct ThreadwiseGenericTensorSliceCopy_v2 +{ + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2() + : mSrcSliceOrigin(make_zero_array()), + mDstSliceOrigin(make_zero_array()) + { + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin, + DstCoordinate dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + } + + __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + __device__ void Run(const TData* p_src, TData* p_dst) const + { + static_ford{}([&](auto data_id) { + p_dst[(mDstSliceOrigin + data_id).GetOffset()] = + p_src[(mSrcSliceOrigin + data_id).GetOffset()]; + + }); + } + + __device__ void MoveSrcSlicingWindow(Array step_sizes, bool positive_direction) + { + if(positive_direction) + { + mSrcSliceOrigin += step_sizes; + } + else + { + mSrcSliceOrigin -= step_sizes; + } + } + + __device__ void MoveDstSlicingWindow(Array step_sizes, bool positive_direction) + { + if(positive_direction) + { + mDstSliceOrigin += step_sizes; + } + else + { + 
mDstSliceOrigin -= step_sizes; + } + } + + // private: + SrcCoordinate mSrcSliceOrigin; + DstCoordinate mDstSliceOrigin; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/utility/Array.hpp b/composable_kernel/include/utility/Array.hpp index afe5b392f6..3b7bba8429 100644 --- a/composable_kernel/include/utility/Array.hpp +++ b/composable_kernel/include/utility/Array.hpp @@ -9,7 +9,8 @@ namespace ck { template struct Array { - using Type = Array; + using Type = Array; + using data_type = TData; static constexpr index_t nSize = NSize; @@ -20,7 +21,7 @@ struct Array { } - __host__ __device__ constexpr index_t GetSize() const { return NSize; } + __host__ __device__ static constexpr index_t GetSize() { return NSize; } template __host__ __device__ constexpr TData operator[](Number) const @@ -208,6 +209,21 @@ __host__ __device__ constexpr auto operator-(Array a, Array +__host__ __device__ constexpr auto operator+=(Array& a, Array b) +{ + a = a + b; + return a; +} + +// Array -= Array +template +__host__ __device__ constexpr auto operator-=(Array& a, Array b) +{ + a = a - b; + return a; +} // Array = Array + Sequence template __host__ __device__ constexpr auto operator+(Array a, Sequence b) diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp index ec56211479..4e410964c9 100644 --- a/composable_kernel/include/utility/Sequence.hpp +++ b/composable_kernel/include/utility/Sequence.hpp @@ -12,7 +12,8 @@ struct is_valid_sequence_map; template struct Sequence { - using Type = Sequence; + using Type = Sequence; + using data_type = index_t; static constexpr index_t mSize = sizeof...(Is); diff --git a/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp index 5669ea7243..6e9d240d02 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp @@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc, constexpr index_t InBlockCopyDataPerAccess_W2 = 4; - using WeiBlockCopySubLengths_E_K = Sequence<2, 2>; - using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2; + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif constexpr index_t N0 = N / (N1 * N2); diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..1788af0cf1 --- /dev/null +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,152 @@ +#pragma once +#include +#include "device.hpp" +#include "tensor.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +template +void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + 
ConvStrides, + ConvDilations, + index_t nrepeat) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr index_t Hi = in_nchw_desc.GetLength(I2); + constexpr index_t Wi = in_nchw_desc.GetLength(I3); + + constexpr index_t N = out_nkhw_desc.GetLength(I0); + constexpr index_t Ho = out_nkhw_desc.GetLength(I2); + constexpr index_t Wo = out_nkhw_desc.GetLength(I3); + + constexpr index_t K = wei_kcyx_desc.GetLength(I0); + constexpr index_t C = wei_kcyx_desc.GetLength(I1); + constexpr index_t Y = wei_kcyx_desc.GetLength(I2); + constexpr index_t X = wei_kcyx_desc.GetLength(I3); + + std::size_t data_sz = sizeof(T); + DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_device_buf.ToDevice(in_nchw.mData.data()); + wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // 1x1 filter, 8x8 image + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 128; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_B = Sequence<4, 1>; + using InBlockCopyClusterLengths_E_B = Sequence<2, 128>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B] + using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B] + using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B] + + constexpr index_t InBlockCopyDataPerAccess_B = 1; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#endif + + constexpr index_t B = N * Ho * Wo; + + constexpr index_t GridSize = + ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + for(index_t i = 0; i < nrepeat; ++i) + { + constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw< + GridSize, + BlockSize, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + BPerBlock, + KPerBlock, + EPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_B, + InBlockCopyClusterLengths_E_B, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopyDataPerAccess_B, + 
WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K>{}; + + float time = launch_kernel(run_gridwise_convolution_kernel, + dim3(GridSize), + dim3(BlockSize), + 0, + static_cast(in_nchw_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms, %f TFlop/s\n", + time, + (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / + (std::size_t(1000) * 1000 * 1000) / time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 125adf6b83..418f0d4a74 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -16,6 +16,7 @@ #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" +#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" struct GeneratorTensor_1 { @@ -71,13 +72,16 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 8; + constexpr index_t N = 2; constexpr index_t C = 16; - constexpr index_t HI = 3; - constexpr index_t WI = 18; + constexpr index_t HI = 8; + constexpr index_t WI = 8; constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; constexpr index_t HPad = 0; constexpr index_t WPad = 0; @@ -249,7 +253,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -265,7 +269,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% constexpr index_t N = 128; @@ -491,7 +495,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 +#if 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #elif 0 @@ -548,7 +552,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -558,6 +562,16 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); +#elif 1 + device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); #elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc, in_nchw,
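
The new v4r4 kernel treats the NCHW convolution as an implicit GEMM over E = C * Y * X and B = N * Ho * Wo. The host-side reference below is a sketch of exactly that index mapping (no padding, packed NCHW / KCYX / NKHW layouts, strides and dilations as in the kernel); the function name and signature are illustrative, not part of the library.

    #include <vector>

    // out[n, k, ho, wo] = sum over (c, y, x) of wei[k, c, y, x] * in[n, c, hi, wi]
    // expressed as a GEMM over E = C*Y*X (reduction) and B = N*Ho*Wo (merged output dim)
    void conv_nchw_kcyx_nkhw_as_implicit_gemm(const std::vector<float>& in,   // [N, C, Hi, Wi]
                                              const std::vector<float>& wei,  // [K, C, Y, X]
                                              std::vector<float>& out,        // [N, K, Ho, Wo]
                                              int N, int C, int Hi, int Wi,
                                              int K, int Y, int X,
                                              int Ho, int Wo,
                                              int ConvStrideH, int ConvStrideW,
                                              int ConvDilationH, int ConvDilationW)
    {
        const int E = C * Y * X;   // GEMM reduction dimension
        const int B = N * Ho * Wo; // GEMM output dimension merged from (N, Ho, Wo)

        for(int k = 0; k < K; ++k)
            for(int b = 0; b < B; ++b)
            {
                // decompose b -> (n, ho, wo)
                const int n  = b / (Ho * Wo);
                const int ho = (b / Wo) % Ho;
                const int wo = b % Wo;

                float acc = 0;

                for(int e = 0; e < E; ++e)
                {
                    // decompose e -> (c, y, x)
                    const int c = e / (Y * X);
                    const int y = (e / X) % Y;
                    const int x = e % X;

                    const int hi = ho * ConvStrideH + y * ConvDilationH;
                    const int wi = wo * ConvStrideW + x * ConvDilationW;

                    acc += wei[((k * C + c) * Y + y) * X + x] *
                           in[((n * C + c) * Hi + hi) * Wi + wi];
                }

                out[((n * K + k) * Ho + ho) * Wo + wo] = acc;
            }
    }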
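
The grid is partitioned over the [K, B] output space: the driver computes GridSize as the product of the tile counts, and the kernel's block_work_desc turns the 1-d block id back into a (K tile, B tile) pair. The sketch below assumes the packed [KBlockWork, BBlockWork] descriptor makes the B tile index the fastest-moving one; names are illustrative.

    struct BlockWork
    {
        int k_block_data_on_global;
        int b_block_data_on_global;
    };

    inline int grid_size(int K, int B, int KPerBlock, int BPerBlock)
    {
        return ((K + KPerBlock - 1) / KPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
    }

    inline BlockWork decompose_block_id(int block_id, int B, int KPerBlock, int BPerBlock)
    {
        const int BBlockWork = B / BPerBlock;          // number of tiles along B
        const int k_block    = block_id / BBlockWork;  // slower-moving tile index
        const int b_block    = block_id % BBlockWork;  // faster-moving tile index
        return {k_block * KPerBlock, b_block * BPerBlock};
    }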
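
BlockwiseGenericTensorSliceCopy_v2 splits the [EPerBlock, BPerBlock] (or [EPerBlock, KPerBlock]) tile among threads by decomposing the thread id according to ThreadClusterArrangeOrder and scaling the resulting data-cluster index by SubLengths. A 2-D sketch of that derivation, reduced to the two arrange orders used in this patch ([0, 1] for the input copy, [1, 0] for the weight copy); the helper is illustrative, not CK API.

    struct ThreadDataOrigin
    {
        int d0; // origin along dimension 0 (E)
        int d1; // origin along dimension 1 (B or K)
    };

    inline ThreadDataOrigin blockwise_copy_thread_origin(int tid,
                                                         int cluster_len0, int cluster_len1, // ClusterLengths
                                                         int sub_len0, int sub_len1,         // SubLengths
                                                         bool arrange_order_is_1_0)
    {
        int c0, c1; // this thread's position in the data cluster
        if(arrange_order_is_1_0)
        {
            // arrange order [1, 0]: dimension 0 varies fastest with the thread id
            c0 = tid % cluster_len0;
            c1 = tid / cluster_len0;
        }
        else
        {
            // arrange order [0, 1]: dimension 1 varies fastest with the thread id
            c1 = tid % cluster_len1;
            c0 = tid / cluster_len1;
        }
        return {c0 * sub_len0, c1 * sub_len1};
    }

For the weight copy in this patch (ClusterLengths [2, 128], arrange order [1, 0]), this puts 128 threads side by side along K and 2 along E, which is what lets WeiBlockCopySrcDataPerRead_E stay vectorizable along E.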
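
ThreadwiseGenericTensorSliceCopy_v2 keeps one coordinate per side (slice origin plus a running offset), so Run only has to add a per-element offset: for a normal, purely strided coordinate, (origin + data_id).GetOffset() reduces to the dot product below. A minimal 2-D sketch with illustrative names:

    // what ThreadwiseGenericTensorSliceCopy_v2::Run amounts to for normal coordinates
    void threadwise_copy_2d(const float* p_src, int src_stride0, int src_stride1, int src_origin_offset,
                            float* p_dst, int dst_stride0, int dst_stride1, int dst_origin_offset,
                            int len0, int len1)
    {
        for(int i0 = 0; i0 < len0; ++i0)
            for(int i1 = 0; i1 < len1; ++i1)
            {
                const int src_off = src_origin_offset + i0 * src_stride0 + i1 * src_stride1;
                const int dst_off = dst_origin_offset + i0 * dst_stride0 + i1 * dst_stride1;
                p_dst[dst_off] = p_src[src_off];
            }
    }

MoveSrcSlicingWindow / MoveDstSlicingWindow then only shift the origin coordinate (and with it the precomputed offset), so the copy loop itself never changes.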
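
The new MergedTensorCoordinate advances an index on a merged dimension (for example B = merge(N, Ho, Wo)) by decomposing the step into the original dimensions and propagating carries, instead of re-deriving the whole multi-index from a 1-d offset each time. A host-side sketch of the positive-direction move, assuming the move stays inside the tensor (the kernel makes the same assumption and does not bound-check the highest dimension); the function is illustrative only.

    #include <array>
    #include <cstddef>

    // advance a partial (un-merged) multi-index by a 1-d step inside the merged
    // dimension and return the new partial offset
    template <std::size_t NDim>
    int move_merged_dim_positive(std::array<int, NDim>& idx,           // partial original multi-index
                                 const std::array<int, NDim>& lengths, // original lengths
                                 const std::array<int, NDim>& strides, // original strides
                                 int step)
    {
        // decompose the 1-d step into a per-dimension step (GetMultiIndexFrom1dIndex)
        std::array<int, NDim> step_idx{};
        for(std::size_t r = 0; r < NDim; ++r)
        {
            const std::size_t i = NDim - 1 - r;
            step_idx[i] = step % lengths[i];
            step        = step / lengths[i];
        }

        // add with carry, starting from the lowest dimension; the highest
        // dimension is not checked
        bool carry = false;
        for(std::size_t r = 0; r < NDim; ++r)
        {
            const std::size_t i = NDim - 1 - r;
            idx[i] += step_idx[i] + (carry ? 1 : 0);
            carry = false;
            if(i > 0 && idx[i] >= lengths[i])
            {
                idx[i] -= lengths[i];
                carry = true;
            }
        }

        // recompute this merged dimension's partial offset
        int offset = 0;
        for(std::size_t i = 0; i < NDim; ++i)
            offset += idx[i] * strides[i];
        return offset;
    }

The caller then updates the running total offset by the difference between the new and old partial offsets, adding before subtracting so an unsigned index type never underflows, which is why mPartialOffsets is stored per merged dimension.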
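
The #else branch of the output write-back (the stated workaround until slicing a merged dimension is supported) amounts to the loop below: K1 and B1 are the per-repeat spans of the blockwise GEMM, each repeat writes GemmNPerThreadSubC consecutive columns that sit B1 apart in the merged B = (N, Ho, Wo) dimension, and the thread tile is taken here to be a row-major [GemmMRepeat*GemmMPerThreadSubC, GemmNRepeat*GemmNPerThreadSubC] register array; that register layout and the function name are assumptions for illustration.

    void write_back_thread_output(const float* p_out_thread, // per-thread GEMM results
                                  float* p_out_global,       // [N, K, Ho, Wo], packed
                                  int K, int Ho, int Wo,
                                  int k_thread_data_on_global, int b_thread_data_on_global,
                                  int GemmMRepeat, int GemmMPerThreadSubC,
                                  int GemmNRepeat, int GemmNPerThreadSubC,
                                  int K1, int B1)
    {
        const int thread_n = GemmNRepeat * GemmNPerThreadSubC; // tile width in registers

        for(int nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
            for(int k0 = 0; k0 < GemmMRepeat; ++k0)
                for(int k1 = 0; k1 < GemmMPerThreadSubC; ++k1)
                    for(int b = 0; b < GemmNPerThreadSubC; ++b)
                    {
                        // rows of one repeat are K1 apart, columns of one repeat are B1 apart
                        const int k_global = k_thread_data_on_global + k0 * K1 + k1;
                        const int b_global = b_thread_data_on_global + nrepeat * B1 + b;

                        // register tile: row (k0, k1), column (nrepeat, b)
                        const int reg = (k0 * GemmMPerThreadSubC + k1) * thread_n +
                                        nrepeat * GemmNPerThreadSubC + b;

                        // b_global indexes the merged (N, Ho, Wo) dimension of the NKHW output
                        const int n  = b_global / (Ho * Wo);
                        const int ho = (b_global / Wo) % Ho;
                        const int wo = b_global % Wo;

                        p_out_global[((n * K + k_global) * Ho + ho) * Wo + wo] = p_out_thread[reg];
                    }
    }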