From 701b7341f0e3b903a561cf1f287633504fbeb0ae Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 7 Aug 2019 19:25:54 -0500 Subject: [PATCH] clean up --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 24 +--- .../blockwise_generic_tensor_slice_copy.hpp | 113 +++++------------- .../threadwise_generic_tensor_slice_copy.hpp | 98 --------------- driver/src/driver.cpp | 2 +- 4 files changed, 38 insertions(+), 199 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 66fe01c71a..9747bf5e55 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -155,13 +155,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); -#if 0 +#if 1 // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1{}, Number{}); -#if 0 +#if 1 // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1{}, True); // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, // True); @@ -356,7 +354,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; -#if 0 +#if 1 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); @@ -436,19 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); -#if 0 - ThreadwiseGenericTensorSliceCopy_v1r1< - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()), - arithmetic_sequence_gen<0, 8, 1>::type, - arithmetic_sequence_gen<0, 8, 1>::type, - 7, - 7, - 1, - 1>(make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#elif 0 +#if 1 ThreadwiseGenericTensorSliceCopy_v1r2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 7473194177..b20d2fd4d4 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -237,47 +237,23 @@ struct BlockwiseGenericTensorSliceCopy_v1 thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); #endif -// By position the origin of the per-thread window at the point, where multi-index -// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy -// is assuming each thread is copy a noraml (not merged) tensor. -// To satisfy this assumption, the user need to make sure that, on a merged dimension -// that constains multiple original dimensions, the length of the last original -// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on -// the merged dimension need to be 1. These sanity checks are performed in constructor -// of BlockwiseGenericTensorSliceCopy_v1 -#if 0 - threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, - p_src + src_offset + mThreadSrcOffset, - make_zero_array(), - thread_buffer_desc, - p_buffer + buffer_offset, - make_zero_array(), - thread_sub_tensor_lengths, - SrcDimAccessOrder{}, - Number{}); -#elif 1 - ThreadwiseGenericTensorSliceCopy_v1r1< - SrcDesc, - decltype(thread_buffer_desc), - SubLengths, - SrcDimAccessOrder, - typename arithmetic_sequence_gen<0, nDim, 1>::type, - SrcVectorAccessDim, - 0, - SrcDataPerAccess, - 1>(make_zero_array(), make_zero_array()) + // By position the origin of the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // To satisfy this assumption, the user need to make sure that, on a merged dimension + // that constains multiple original dimensions, the length of the last original + // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on + // the merged dimension need to be 1. These sanity checks are performed in constructor + // of BlockwiseGenericTensorSliceCopy_v1 + ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), + make_zero_array()) .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); -#elif 1 - ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) - .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); -#endif }); } @@ -316,48 +292,23 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); #endif -// By position the origin of the per-thread window at the point, where multi-index -// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy -// is assuming each thread is copy a noraml (not merged) tensor. -// To satisfy this assumption, the user need to make sure that, on a merged dimension -// that constains multiple original dimensions, the length of the last original -// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on -// the merged dimension need to be 1. These sanity checks are performed in constructor -// of BlockwiseGenericTensorSliceCopy_v1 -#if 0 - threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc, - p_buffer + buffer_offset, - make_zero_array(), - DstDesc{}, - p_dst + dst_offset + mThreadDstOffset, - make_zero_array(), - thread_sub_tensor_lengths, - DstDimAccessOrder{}, - Number{}); -#elif 1 - ThreadwiseGenericTensorSliceCopy_v1r1< - decltype(thread_buffer_desc), - DstDesc, - SubLengths, - typename arithmetic_sequence_gen<0, nDim, 1>::type, - DstDimAccessOrder, - 0, - DstVectorAccessDim, - 1, - DstDataPerAccess>(make_zero_array(), - make_zero_array()) + // By position the origin of the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // To satisfy this assumption, the user need to make sure that, on a merged dimension + // that constains multiple original dimensions, the length of the last original + // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on + // the merged dimension need to be 1. These sanity checks are performed in constructor + // of BlockwiseGenericTensorSliceCopy_v1 + ThreadwiseGenericTensorSliceCopy_v1r2( + make_zero_array(), make_zero_array()) .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); -#elif 1 - ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) - .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); -#endif }); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 5354cdd61c..9ab18f4f3a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -6,10 +6,6 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "tensor_coordinate.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 -#endif - #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #endif @@ -24,100 +20,6 @@ namespace ck { -// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1 -template -__device__ void threadwise_generic_tensor_slice_copy_v1( - SrcDesc, - const Float* __restrict__ p_src, - Array src_multi_id_begin, - DstDesc, - Float* __restrict__ p_dst, - Array dst_multi_id_begin, - SliceLengths, - DimAccessOrder, - Number) -{ - constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); - - // TODO: do more sanity-check here, something like: - // constexpr auto src_strides_in_access_order = - // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - - // constexpr auto dst_strides_in_access_order = - // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - - // // check src/dst stride on the lowest access dimension - // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && - // (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), - // "wrong! src/dst stride on the lowest access dimension needs to be 1 for " - // "vectorized read/write"); - - constexpr auto slice_lengths_in_access_order = - SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); - - // check slice length on the lowest access dimension - static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0, - "wrong! slice length on the lowest access dimension should be evenly divided by " - "DataPerAccess"); - - constexpr index_t num_access_on_lowest_access_dimension = - slice_lengths_in_access_order.Back() / DataPerAccess; - - constexpr auto access_lengths = slice_lengths_in_access_order.Modify( - Number{}, Number{}); - - using vector_t = typename vector_type::MemoryType; - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 - static_ford{}([&](auto access_multi_id) { - constexpr index_t itmp = access_multi_id.Back() * DataPerAccess; - - constexpr auto data_multi_id_in_access_order = - access_multi_id.Modify(Number{}, Number{}); - - constexpr auto data_multi_id = - data_multi_id_in_access_order.ReorderGivenOld2New(DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); - }); -#else - ford{}([&](auto access_multi_id) { - auto data_multi_id_in_access_order = access_multi_id; - data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; - - const auto data_multi_id = - reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); - }); -#endif -} - // This threadwise copy allow vector access of src and dst. // It allows the dimensions of vector access to be different on src and dst. // It also allows the vector size to be different on src and dst. diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index ea9f29b2e5..2f9a2c3e76 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,