From 527073c38b9e848a3d0b68623504e2625d89ecae Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Thu, 15 Jun 2023 23:13:59 +0800 Subject: [PATCH] Using number of compute units to set gridSize (#754) * Add getAvailableComputeUnitCount() interface * Use available number of compute units to set kernel grid size [ROCm/composable_kernel commit: c5f6ec842c83a386b78dc0098fcdf081586df309] --- include/ck/host_utility/stream_utility.hpp | 43 ++++++++++++ .../impl/device_elementwise_2d_impl.hpp | 69 +++++++++---------- .../device/impl/device_elementwise_impl.hpp | 43 ++++++------ 3 files changed, 97 insertions(+), 58 deletions(-) create mode 100644 include/ck/host_utility/stream_utility.hpp diff --git a/include/ck/host_utility/stream_utility.hpp b/include/ck/host_utility/stream_utility.hpp new file mode 100644 index 0000000000..ef05f2e26b --- /dev/null +++ b/include/ck/host_utility/stream_utility.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/stream_config.hpp" +#include "ck/host_utility/hip_check_error.hpp" + +static int getAvailableComputeUnitCount(const StreamConfig& stream_config) +{ + constexpr int MAX_MASK_DWORDS = 64; + + // assume at most 64*32 = 2048 CUs + uint32_t cuMask[MAX_MASK_DWORDS]; + + for(int i = 0; i < MAX_MASK_DWORDS; i++) + cuMask[i] = 0; + + auto countSetBits = [](uint32_t dword) { + int count = 0; + + while(dword != 0) + { + if(dword & 0x1) + count++; + + dword = dword >> 1; + }; + + return (count); + }; + + hip_check_error(hipExtStreamGetCUMask(stream_config.stream_id_, MAX_MASK_DWORDS, &cuMask[0])); + + int ret = 0; + + for(int i = 0; i < MAX_MASK_DWORDS; i++) + ret += countSetBits(cuMask[i]); + + return (ret); +}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp index c5f90e40fe..02ef29e32d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp @@ -13,6 +13,7 @@ #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/stream_utility.hpp" namespace ck { namespace tensor_operation { @@ -171,10 +172,7 @@ struct DeviceElementwise2dImpl : public DeviceElementwise 0, ""); static_assert(NumDim_n > 0, ""); @@ -192,34 +190,10 @@ struct DeviceElementwise2dImpl : public DeviceElementwise(out_dev_buffers[I.value]); }, Number{}); - - in_grid_2d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeDescriptor_MN(lengths, - inStridesArray[I.value], - gridSize_, - blockSize_, - num_threads_m_, - num_threads_n_); - }, - Number{}); - - out_grid_2d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeDescriptor_MN(lengths, - outStridesArray[I.value], - gridSize_, - blockSize_, - num_threads_m_, - num_threads_n_); - }, - Number{}); } InDataTypePointerTuple in_dev_buffers_; OutDataTypePointerTuple out_dev_buffers_; - InGrid2dDescTuple in_grid_2d_desc_tuple_; - OutGrid2dDescTuple out_grid_2d_desc_tuple_; std::array lengths_; std::array, NumInput> inStridesArray_; @@ -227,15 +201,38 @@ struct DeviceElementwise2dImpl : public DeviceElementwise{}); + + auto out_grid_2d_desc_tuple = generate_tuple( + [&](auto I) { + return MakeDescriptor_MN(arg.lengths_, + arg.outStridesArray_[I.value], + gridSize, + arg.blockSize_, + num_threads_m, + num_threads_n); + }, + Number{}); + const auto kernel = kernel_elementwise_2d(out_dev_buffers[I.value]); }, Number{}); - - in_grid_1d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeDescriptor_M( - lengths, inStridesArray[I.value], gridSize_, blockSize_); - }, - Number{}); - - out_grid_1d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeDescriptor_M( - lengths, outStridesArray[I.value], gridSize_, blockSize_); - }, - Number{}); } InDataTypePointerTuple in_dev_buffers_; OutDataTypePointerTuple out_dev_buffers_; - InGrid1dDescTuple in_grid_1d_desc_tuple_; - OutGrid1dDescTuple out_grid_1d_desc_tuple_; std::array lengths_; std::array, NumInput> inStridesArray_; @@ -187,13 +171,28 @@ struct DeviceElementwiseImpl ElementwiseOperation elementwise_op_; index_t blockSize_; - index_t gridSize_; }; struct Invoker : public BaseInvoker { float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + index_t gridSize = getAvailableComputeUnitCount(stream_config); + + auto in_grid_1d_desc_tuple = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_); + }, + Number{}); + + auto out_grid_1d_desc_tuple = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_); + }, + Number{}); + const auto kernel = kernel_elementwise_1d