From 9bdc6209c4a058e89d95d09f51d524241e43cc2f Mon Sep 17 00:00:00 2001 From: cloudhan Date: Fri, 18 Aug 2023 11:14:59 +0800 Subject: [PATCH] Use asynchronous version of hipMemset (#850) [ROCm/composable_kernel commit: d52ec01652b7d620386251db92455968d8d90bdc] --- ...conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 5 +++-- .../gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp | 6 ++++-- .../gpu/device/impl/device_gemm_xdl_streamk.hpp | 7 +++++-- .../impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 6 ++++-- .../device_splitk_contraction_multiple_d_xdl_cshuffle.hpp | 5 +++-- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 1e50e1ae89..23440e24f6 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -532,11 +532,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ float ave_time = 0; const auto Run = [&](const auto& kernel) { - hipGetErrorString(hipMemset( + hipGetErrorString(hipMemsetAsync( arg.p_c_grid_, 0, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(CDataType))); + sizeof(CDataType), + stream_config.stream_id_)); ave_time = launch_and_time_kernel(stream_config, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp index 2b8f760737..505e1846d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp @@ -158,8 +158,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK 1) - hipGetErrorString( - hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType))); + hipGetErrorString(hipMemsetAsync(karg.p_c_grid, + 0, + karg.M * karg.N * sizeof(CDataType), + stream_config.stream_id_)); ave_time = launch_and_time_kernel( stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp index 6fe5784c28..8de42ba9ef 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -147,7 +147,10 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK; - hipGetErrorString(hipMemset( + hipGetErrorString(hipMemsetAsync( arg.p_e_grid_, 0, arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(EDataType))); + sizeof(EDataType), + stream_config.stream_id_)); return launch_and_time_kernel(stream_config, kernel,