diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 99a2ec9ee3..f74b05e750 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -87,7 +87,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW constexpr index_t InBlockReorderDataPerWrite_N = 1; - using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + using WeiBlockCopyClusterLengths = void; constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; @@ -122,7 +122,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW constexpr index_t InBlockReorderDataPerWrite_N = 2; - using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + using WeiBlockCopyClusterLengths = void; constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 4; @@ -136,10 +136,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t HoPerBlock = 4; constexpr index_t WoPerBlock = 8; - constexpr index_t NPerThread = 2; + constexpr index_t NPerThread = 4; constexpr index_t KPerThread = 8; constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 4; + constexpr index_t WoPerThread = 2; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -155,14 +155,14 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 4, 8>; using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 1; + constexpr index_t InBlockReorderDataPerWrite_N = 4; - using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + using WeiBlockCopyClusterLengths = void; constexpr index_t WeiBlockCopyDataPerRead_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_W = 1; -#elif 0 - // for 3x3, 28x28, v1r2, Pascal + constexpr index_t OutThreadCopyDataPerWrite_W = 2; +#elif 1 + // for 3x3, 28x28, v1r3, Pascal constexpr index_t BlockSize = 128; constexpr index_t NPerBlock = 16; @@ -186,13 +186,13 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>; + using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>; using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 2; - constexpr index_t InBlockReorderDataPerWrite_N = 4; + constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW + constexpr index_t InBlockReorderDataPerWrite_N = 4; - using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>; + using WeiBlockCopyClusterLengths = void; constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index fd6a3bbf8d..f25b99ca27 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -371,7 +371,7 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -413,13 +413,13 @@ int main(int argc, char* argv[]) { #if 1 // 3x3, 34x34 - constexpr index_t N = 64; - constexpr index_t C = 256; + constexpr index_t N = 64; + constexpr index_t C = 256; constexpr index_t HI = 34; constexpr index_t WI = 34; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; constexpr index_t HPad = 0; constexpr index_t WPad = 0; @@ -597,6 +597,8 @@ int main(int argc, char* argv[]) }; wei_kcyx.GenerateTensorValue(gen_wei, num_thread); #endif + + // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread); } #if 1 diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index dcafa0f4c8..2a85725a50 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -359,19 +359,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; -#if 0 - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); -#else constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(Sequence{}); -#endif constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); @@ -401,20 +387,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw } #endif -#if 0 - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); -#else - constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 6, 3, 4, 5>{}; + constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 3, 4, 5, 6>{}; threadwise_nd_tensor_copy_reorder_given_dst2src_v2( out_10d_thread_desc, @@ -428,8 +401,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); -// Number{}); -#endif + // Number{}); }) .else_([&](auto f_dummy) { static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && @@ -446,19 +418,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; -#if 0 - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); -#else constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(Sequence{}); -#endif constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); @@ -486,26 +444,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw "out_k_h_w_n_global_desc"); print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - for(index_t i = 0; i < 64; ++i) - { - printf("out %f, ", p_out_thread[i]); - } } #endif -#if 0 - threadwise_nd_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); -#else constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{}; threadwise_nd_tensor_copy_reorder_given_dst2src_v2( @@ -520,8 +461,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); -// Number{}); -#endif + // Number{}); }); } };