diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp index 83e2aa2642..81fef7dcbd 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp @@ -87,9 +87,6 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; - constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; - constexpr unsigned WeiBlockCopyThreadPerDim1 = 32; - constexpr unsigned InBlockCopy_ThreadPerDimC = 4; constexpr unsigned InBlockCopy_ThreadPerDimH = 4; constexpr unsigned InBlockCopy_ThreadPerDimW = 2; @@ -278,8 +275,6 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, KPerThread, HoPerThread, WoPerThread, - WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1, Sequence #include #include +#include #include "config.h" #include "tensor.hpp" #include "ConstantTensorDescriptor.hip.hpp" @@ -378,7 +379,7 @@ void check_error(const Tensor& ref, const Tensor& result) std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; } -int main() +int main(int argc, char* argv[]) { #if 0 constexpr unsigned N = 1; @@ -571,7 +572,14 @@ int main() std::size_t num_thread = std::thread::hardware_concurrency(); - bool do_verification = true; + if(argc != 3) + { + printf("arg1: do_verification, arg2: nrepeat\n"); + exit(1); + } + + bool do_verification = atoi(argv[1]); + unsigned nrepeat = atoi(argv[2]); if(do_verification) { @@ -587,8 +595,6 @@ int main() #endif } - unsigned nrepeat = 200; - #if 1 #if 0 device_direct_convolution_1 diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp index cc65bb7a6e..cc8c08e8d5 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp @@ -23,8 +23,6 @@ template {}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc"); - print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc"); - - print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc"); - - print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc"); - } -#endif - // blockwise copy // input: format is [C, Hi, Wi, N] -#if 0 - constexpr auto blockwise_in_copy = - Blockwise4dTensorCopy1{}; -#elif 1 - const auto blockwise_in_copy = Blockwise4dTensorCopy3{}; -#endif -// blockwise wei copy -// format is [CPerBlock*S*R,KPerBlock] -#if 0 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy1{}; -#elif 0 - const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; -#elif 1 + // blockwise wei copy + // format is [CPerBlock*S*R,KPerBlock] const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; -#endif // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] + // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] + // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( Number{}, Number{}, Number{}); @@ -185,23 +145,6 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric Number{}, Number{}); -#if 0 - const auto blockwise_batch_gemm = - Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; -#else const auto blockwise_batch_gemm = BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< BlockSize, decltype(a_cxk_block_mtx_desc), @@ -219,7 +162,6 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric GemmNLevel1Cluster, GemmKPerThreadLoop, HoPerThread>{}; -#endif // LDS: be careful of alignment constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -277,26 +219,6 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - // for v1 batch-gemm - const unsigned k_thread_data_begin = c_thread_mtx_begin.row; - const unsigned ho_thread_data_begin = c_thread_mtx_begin.batch; - const unsigned wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; - const unsigned n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - - threadwise_4d_tensor_copy_v2( - out_khwn_thread_desc, - p_out_thread, - out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_khwn_thread_desc.GetLengths(), - Number{}); -#elif 0 - const auto c_thread_mtx_begin = - blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - for(unsigned k = 0; k < out_khwn_thread_desc.GetLength(I0); ++k) { for(unsigned ho = 0; ho < out_khwn_thread_desc.GetLength(I1); ++ho) @@ -334,7 +256,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric const unsigned k_thread_data_begin = c_thread_mtx_begin.row; const unsigned ho_thread_data_begin = c_thread_mtx_begin.batch; const unsigned wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; - const unsigned n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; + const unsigned n_thread_data_begin = c_thread_mtx_begin.col - NPerBlock * wo_thread_data_begin; // this is for v2 GEMM // output is a 8d tensor @@ -375,6 +297,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric } else if(NPerThread == NPerBlock) { + // not implemented yet + assert(false); } else {