diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e757ef048..12bf7ae562 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,12 +142,11 @@ enable_clang_tidy( -cppcoreguidelines-prefer-member-initializer ${MIOPEN_TIDY_CHECKS} - ${MIOPEN_TIDY_ERRORS} + ${MIOPEN_TIDY_ERRORS} HEADER_FILTER "\.hpp$" EXTRA_ARGS -DMIOPEN_USE_CLANG_TIDY - ) include(CppCheck) diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp index 694cf9c6a3..f021a7b9b4 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp @@ -71,7 +71,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 static constexpr index_t N0 = N / N1; __host__ __device__ static constexpr auto - MakeAKM0M1BlockDescriptor(const AKMBlockDesc& a_k_m_block_desc) + MakeAKM0M1BlockDescriptor(const AKMBlockDesc& /* a_k_m_block_desc */) { const auto a_k_m0_m1_block_desc = transform_dynamic_tensor_descriptor( AKMBlockDesc{}, @@ -84,7 +84,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 } __host__ __device__ static constexpr auto - MakeBKN0N1BlockDescriptor(const BKNBlockDesc& b_k_n_block_desc) + MakeBKN0N1BlockDescriptor(const BKNBlockDesc& /* b_k_n_block_desc */) { const auto b_k_n0_n1_block_desc = transform_dynamic_tensor_descriptor( BKNBlockDesc{}, @@ -194,7 +194,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer> - __device__ void Run(const CM0M1N0N1ThreadDesc& c_m0_m1_n0_n1_thread_desc, + __device__ void Run(const CM0M1N0N1ThreadDesc& /* c_m0_m1_n0_n1_thread_desc */, const ABlockBuffer& a_block_buf, const BBlockBuffer& b_block_buf, CThreadBuffer& c_thread_buf) const diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp index 25b2ba7ce8..b656b4595a 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -120,9 +120,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 "wrong! inconsistent type"); constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; constexpr auto a_block_mtx = BlockMatrixA{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp index 74c9dc1547..715fbc0b41 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -270,7 +270,6 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline const index_t waveId = thread_id / WaveSize; const index_t laneId = thread_id % WaveSize; const index_t waveId_m = waveId / NWaves; - const index_t waveId_n = waveId % NWaves; if constexpr(xdlops_gemm.IsKReduction) { diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp index 7a4ef1d7ea..2c45e42a0e 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp @@ -619,17 +619,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2 // output: register to global memory { - constexpr index_t M11 = - M1PerThreadM111 * M11N11ThreadClusterM1100 * M11N11ThreadClusterM1101; - constexpr index_t N11 = - N1PerThreadN111 * M11N11ThreadClusterN1100 * M11N11ThreadClusterN1101; - - constexpr index_t M10 = MPerBlockM1 / M11; - constexpr index_t N10 = NPerBlockN1 / N11; - - constexpr index_t M111 = M1PerThreadM111; - constexpr index_t N111 = N1PerThreadN111; - constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2( make_tuple(I1, diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp index db3cb99121..5de41b1f7c 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp @@ -191,12 +191,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 const auto M = a_k0_m_k1_grid_desc.GetLength(I1); const auto N = b_k0_n_k1_grid_desc.GetLength(I1); const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - const auto K1 = a_k0_m_k1_grid_desc.GetLength(I2); // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && K0 == b_k0_n_k1_grid_desc.GetLength(I0) && + K1 == a_k0_m_k1_grid_desc.GetLength(I2) && K1 == b_k0_n_k1_grid_desc.GetLength(I2)) && (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0); } @@ -608,19 +608,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3 // output: register to global memory { - constexpr auto M11 = - Number{}; - constexpr auto N11 = - Number{}; - - constexpr index_t M10 = MPerBlockM1 / M11; - constexpr index_t N10 = NPerBlockN1 / N11; - - constexpr index_t M111 = M1PerThreadM111; - constexpr index_t N111 = N1PerThreadN111; - constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2( make_tuple(I1, diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp index 34dea34833..5e90e0e85d 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp @@ -102,7 +102,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 // divide block work by [M, N] #if 0 - const auto k_block_work_num = K / Number{}; const auto ho_block_work_num = Ho / Number{}; const auto wo_block_work_num = Wo / Number{}; const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num; @@ -114,7 +113,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3 const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num; #else // Hack: this force result into SGPR - const index_t k_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock); const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock); const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock); const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num; diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp index a5b1de79a7..124623c702 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp @@ -269,11 +269,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc, const CBlockClusterAdaptor& c_block_cluster_adaptor) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp index 153d512df7..f9d8ac05b6 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp @@ -57,8 +57,6 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; constexpr auto E = ADesc{}.GetLength(I0); constexpr auto K = ADesc{}.GetLength(I1); diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp index 24ba775309..c044036a2c 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); @@ -198,8 +192,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks, nrepeat); - float perf = (float)calculate_convolution_flops( - in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / + float perf = static_cast(calculate_convolution_flops( + in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp index cdd1084c0d..ce94f2071b 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -271,7 +266,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp index b56cbc0335..514ff6a3a9 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp @@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); @@ -194,8 +188,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks, nrepeat); - float perf = (float)calculate_convolution_flops( - in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / + float perf = static_cast(calculate_convolution_flops( + in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp index 601878c347..8596630ad3 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -352,7 +347,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp index ca0d47c33a..583c8a8a79 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -26,7 +26,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( const Tensor& in_n_c_hi_wi, const Tensor& wei_k_c_y_x, Tensor& out_n_k_ho_wo, - ck::index_t nrepeat) + ck::index_t /* nrepeat */) { using namespace ck; diff --git a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp index 8fb276b464..9edbb811ca 100644 --- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -232,8 +232,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( in_grid_move_slice_window_iterator_hacks, nrepeat); - float perf = (float)calculate_convolution_flops( - in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo) / + float perf = static_cast(calculate_convolution_flops( + in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp index 7c4b1043f3..34b9a54374 100644 --- a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp @@ -338,10 +338,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k0_ho_wo_k1_global_desc) / - (std::size_t(1000) * 1000 * 1000) / ave_time; + float perf = + static_cast(calculate_convolution_flops(in_n_c_hi_wi_global_desc, + wei_k_c_y_x_global_desc, + out_n_k0_ho_wo_k1_global_desc)) / + (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; diff --git a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp index b7f8e6039c..4e0f6e9f77 100644 --- a/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp +++ b/host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp @@ -354,10 +354,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp float ave_time = timer.GetElapsedTime() / nrepeat; - float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, - wei_k_c_y_x_global_desc, - out_n_k0_ho_wo_k1_global_desc) / - (std::size_t(1000) * 1000 * 1000) / ave_time; + float perf = + static_cast(calculate_convolution_flops(in_n_c_hi_wi_global_desc, + wei_k_c_y_x_global_desc, + out_n_k0_ho_wo_k1_global_desc)) / + (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl; diff --git a/host/driver_offline/src/conv_bwd_driver_offline.cpp b/host/driver_offline/src/conv_bwd_driver_offline.cpp index c674ee5965..3d1faaaf66 100644 --- a/host/driver_offline/src/conv_bwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_bwd_driver_offline.cpp @@ -128,10 +128,8 @@ int main(int argc, char* argv[]) std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); - switch(layout) + if(layout == ConvTensorLayout::NCHW) { - case ConvTensorLayout::NCHW: - // NCHW in_lengths_host[0] = static_cast(N); in_lengths_host[1] = static_cast(C); in_lengths_host[2] = static_cast(Hi); @@ -144,9 +142,9 @@ int main(int argc, char* argv[]) out_lengths_host[1] = static_cast(K); out_lengths_host[2] = static_cast(Ho); out_lengths_host[3] = static_cast(Wo); - break; - case ConvTensorLayout::NHWC: - // NHWC + } + else if(layout == ConvTensorLayout::NHWC) + { in_lengths_host[0] = static_cast(N); in_lengths_host[1] = static_cast(Hi); in_lengths_host[2] = static_cast(Wi); @@ -159,8 +157,10 @@ int main(int argc, char* argv[]) out_lengths_host[1] = static_cast(Ho); out_lengths_host[2] = static_cast(Wo); out_lengths_host[3] = static_cast(K); - break; - default: throw std::runtime_error("wrong! not implemented"); + } + else + { + throw std::runtime_error("wrong! not implemented"); } Tensor in_host(in_lengths_host); diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp index 7e4eb0571c..54392f3926 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -467,7 +467,6 @@ int main(int argc, char* argv[]) check_error(out_host, out_device); -#if 0 if(do_log) { LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; @@ -475,6 +474,5 @@ int main(int argc, char* argv[]) LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; } -#endif } } diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp index 73126b3c79..ca95c1f138 100644 --- a/host/host_tensor/include/conv_common.hpp +++ b/host/host_tensor/include/conv_common.hpp @@ -62,7 +62,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor( template constexpr std::size_t -calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc) +calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc) { using namespace ck; diff --git a/host/host_tensor/include/host_conv.hpp b/host/host_tensor/include/host_conv.hpp index 7f26cb42f7..c1228f4832 100644 --- a/host/host_tensor/include/host_conv.hpp +++ b/host/host_tensor/include/host_conv.hpp @@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor& in, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, + const InRightPads&, const ConvTensorLayout layout = ConvTensorLayout::NCHW) { using namespace ck; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { double v = 0; @@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor& in, out(n, ho, wo, k) = v; }; - switch(layout) + if(layout == ConvTensorLayout::NCHW) { - case ConvTensorLayout::NCHW: make_ParallelTensorFunctor(f_nchw, out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - case ConvTensorLayout::NHWC: + } + else if(layout == ConvTensorLayout::NHWC) + { make_ParallelTensorFunctor(f_nhwc, out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - default: throw std::runtime_error("wrong! not supported layout"); + } + else + { + throw std::runtime_error("wrong! not supported layout"); } } @@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, constexpr std::size_t HoPerTile = 2; constexpr std::size_t WoPerTile = 2; - std::size_t N = in_nchw.mDesc.GetLengths()[0]; - std::size_t C = in_nchw.mDesc.GetLengths()[1]; - std::size_t HI = in_nchw.mDesc.GetLengths()[2]; - std::size_t WI = in_nchw.mDesc.GetLengths()[3]; + std::size_t N = in_nchw.mDesc.GetLengths()[0]; + std::size_t C = in_nchw.mDesc.GetLengths()[1]; std::size_t K = wei_kcyx.mDesc.GetLengths()[0]; std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; - std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; - std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; + std::size_t Ho = out_nkhw.mDesc.GetLengths()[2]; + std::size_t Wo = out_nkhw.mDesc.GetLengths()[3]; index_t h_pad_low = InLeftPads{}.Get(Number<0>{}); index_t w_pad_low = InLeftPads{}.Get(Number<1>{}); @@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t HiPerTile = HoPerTile + Y - 1; std::size_t WiPerTile = WoPerTile + X - 1; - std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; - std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; + std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile; + std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile; Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); diff --git a/host/host_tensor/include/host_tensor_generator.hpp b/host/host_tensor/include/host_tensor_generator.hpp index 98192e066f..7c09843d01 100644 --- a/host/host_tensor/include/host_tensor_generator.hpp +++ b/host/host_tensor/include/host_tensor_generator.hpp @@ -9,7 +9,7 @@ struct GeneratorTensor_1 int value = 1; template - float operator()(Is... is) + float operator()(Is...) { return value; } diff --git a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp index 983e0f0b74..c1bd754750 100644 --- a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -99,40 +99,48 @@ struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw // clang-format on } - ck::DataTypeEnum_t ABDataTypeEnum; - ck::DataTypeEnum_t AccDataTypeEnum; - ck::DataTypeEnum_t CDataTypeEnum; + ck::DataTypeEnum_t ABDataTypeEnum = ck::DataTypeEnum_t::Unknown; + ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown; + ck::DataTypeEnum_t CDataTypeEnum = ck::DataTypeEnum_t::Unknown; - int BlockSize; + int BlockSize = 1; - int GN0; - int GK1; + int GN0 = -1; + int GK1 = -1; - int GM1PerBlockGM11; - int GN1PerBlockGN11; - int GK0PerBlock; + int GM1PerBlockGM11 = -1; + int GN1PerBlockGN11 = -1; + int GK0PerBlock = -1; - int BM1PerThreadBM11; - int BN1PerThreadBN11; - int BK0PerThread; + int BM1PerThreadBM11 = -1; + int BN1PerThreadBN11 = -1; + int BK0PerThread = -1; - std::array BM10BN10ThreadClusterBM10Xs; - std::array BM10BN10ThreadClusterBN10Xs; + std::array BM10BN10ThreadClusterBM10Xs = {-1, -1}; + std::array BM10BN10ThreadClusterBN10Xs = {-1, -1}; - std::array ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; - std::array ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; - std::array ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; - std::array ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; + std::array ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = { + -1, -1, -1, -1, -1}; + std::array ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = { + -1, -1, -1, -1, -1}; + std::array ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = { + -1, -1, -1, -1, -1}; + std::array ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = { + -1, -1, -1, -1, -1}; - std::array BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; - std::array BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; - std::array BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; - std::array BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; + std::array BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = { + -1, -1, -1, -1, -1}; + std::array BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = { + -1, -1, -1, -1, -1}; + std::array BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = { + -1, -1, -1, -1, -1}; + std::array BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = { + -1, -1, -1, -1, -1}; - int CThreadTransferDstScalarPerVector; + int CThreadTransferDstScalarPerVector = -1; - bool HasMainKBlockLoop; - bool HasDoubleTailKBlockLoop; + bool HasMainKBlockLoop = false; + bool HasDoubleTailKBlockLoop = false; }; struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw