diff --git a/CMakeLists.txt b/CMakeLists.txt index 12bf7ae562..c3c0eab529 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,10 @@ link_libraries(${OpenMP_pthread_LIBRARY}) find_package(HIP REQUIRED) message(STATUS "Build with HIP ${hip_VERSION}") +## half +#find_path(HALF_INCLUDE_DIR half.hpp) +#message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") + ## tidy include(EnableCompilerWarnings) set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp index 3a8883b460..09b39ab2ca 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp @@ -203,9 +203,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 __host__ __device__ static constexpr auto MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc) { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - constexpr auto xdlops_gemm = XdlopsGemm{}; constexpr auto CLayout = xdlops_gemm.GetCLayout(); @@ -217,7 +214,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat); constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat); - constexpr auto N0 = Number{}; constexpr auto N1 = Number{}; const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor( @@ -277,8 +273,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize()); const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - const auto M = a_k0_m_k1_grid_desc.GetLength(I1); - const auto N = b_k0_n_k1_grid_desc.GetLength(I1); // divide block work by [M, N] const auto block_work_idx = diff --git 
a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp index 49e0223b33..187a05554b 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -319,16 +314,13 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyx const auto K = out_n_ho_wo_k_lengths[I3]; const auto C = wei_k_y_x_c_lengths[I3]; - const auto Hi = in_n_hi_wi_c_lengths[I1]; - const auto Wi = in_n_hi_wi_c_lengths[I2]; - const auto Ho = out_n_ho_wo_k_lengths[I1]; const auto Wo = out_n_ho_wo_k_lengths[I2]; const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp 
b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp index ce4dd155f6..e1c6db8045 100644 --- a/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp @@ -35,11 +35,6 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); @@ -304,7 +299,7 @@ void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_k const auto Y = wei_k_y_x_c_lengths[I1]; const auto X = wei_k_y_x_c_lengths[I2]; - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) / (std::size_t(1000) * 1000 * 1000) / ave_time; std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" diff --git a/host/driver_offline/src/conv_bwd_driver_offline.cpp b/host/driver_offline/src/conv_bwd_driver_offline.cpp index 3d1faaaf66..828bbae2bd 100644 --- a/host/driver_offline/src/conv_bwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_bwd_driver_offline.cpp @@ -277,8 +277,6 @@ int main(int argc, char* argv[]) in_right_pads_dev); }; - const auto nhwc_desc = f_make_for_device_nhwc(); - #if USE_CONV_BWD_V4R1_XDL_NHWC if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC) { diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp 
index 4aac2b5e4f..42a2d2f681 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -20,12 +20,12 @@ #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #define USE_DYNAMIC_MODE 1 -#define USE_CONV_FWD_V4R4_NCHW 1 -#define USE_CONV_FWD_V4R4R2_NHWC 1 -#define USE_CONV_FWD_V6R1_NCHW 1 +#define USE_CONV_FWD_V4R4_NCHW 0 +#define USE_CONV_FWD_V4R4R2_NHWC 0 +#define USE_CONV_FWD_V6R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 -#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 +#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1 enum ConvForwardAlgo { diff --git a/host/host_tensor/CMakeLists.txt b/host/host_tensor/CMakeLists.txt index 9c30275220..90249fc664 100644 --- a/host/host_tensor/CMakeLists.txt +++ b/host/host_tensor/CMakeLists.txt @@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE ## the library target add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) +#target_include_directories(host_tensor SYSTEM PUBLIC $) + target_link_libraries(host_tensor PRIVATE hip::device) target_link_libraries(host_tensor INTERFACE hip::host) diff --git a/host/host_tensor/include/host_conv_bwd_data.hpp b/host/host_tensor/include/host_conv_bwd_data.hpp index 07617c3926..ca23422e23 100644 --- a/host/host_tensor/include/host_conv_bwd_data.hpp +++ b/host/host_tensor/include/host_conv_bwd_data.hpp @@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor& in, const ConvStrides& conv_strides, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, + const InRightPads& /* in_right_pads */, const ConvTensorLayout layout = ConvTensorLayout::NCHW) { using namespace ck; @@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor& in, constexpr auto I3 = Number<3>{}; auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { - std::size_t N = in.mDesc.GetLengths()[I0]; - std::size_t C = in.mDesc.GetLengths()[I1]; - 
std::size_t Hi = in.mDesc.GetLengths()[I2]; - std::size_t Wi = in.mDesc.GetLengths()[I3]; - std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t Y = wei.mDesc.GetLengths()[I2]; std::size_t X = wei.mDesc.GetLengths()[I3]; @@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor& in, }; auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) { - std::size_t N = in.mDesc.GetLengths()[I0]; - std::size_t Hi = in.mDesc.GetLengths()[I1]; - std::size_t Wi = in.mDesc.GetLengths()[I2]; - std::size_t C = in.mDesc.GetLengths()[I3]; - std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t Y = wei.mDesc.GetLengths()[I1]; std::size_t X = wei.mDesc.GetLengths()[I2]; @@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor& in, in(n, hi, wi, c) = v; }; - switch(layout) + if(layout == ConvTensorLayout::NCHW) { - case ConvTensorLayout::NCHW: make_ParallelTensorFunctor(f_nchw, in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - case ConvTensorLayout::NHWC: + } + else if(layout == ConvTensorLayout::NHWC) + { make_ParallelTensorFunctor(f_nhwc, in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); - break; - default: throw std::runtime_error("wrong! not supported layout"); + } + else + { + throw std::runtime_error("wrong! not supported layout"); } }