mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 19:40:04 +00:00
Using number of compute units to set gridSize (#754)
* Add getAvailableComputeUnitCount() interface
* Use available number of compute units to set kernel grid size
[ROCm/composable_kernel commit: c5f6ec842c]
This commit is contained in:
43
include/ck/host_utility/stream_utility.hpp
Normal file
43
include/ck/host_utility/stream_utility.hpp
Normal file
@@ -0,0 +1,43 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "ck/stream_config.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
|
||||
static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
|
||||
{
|
||||
constexpr int MAX_MASK_DWORDS = 64;
|
||||
|
||||
// assume at most 64*32 = 2048 CUs
|
||||
uint32_t cuMask[MAX_MASK_DWORDS];
|
||||
|
||||
for(int i = 0; i < MAX_MASK_DWORDS; i++)
|
||||
cuMask[i] = 0;
|
||||
|
||||
auto countSetBits = [](uint32_t dword) {
|
||||
int count = 0;
|
||||
|
||||
while(dword != 0)
|
||||
{
|
||||
if(dword & 0x1)
|
||||
count++;
|
||||
|
||||
dword = dword >> 1;
|
||||
};
|
||||
|
||||
return (count);
|
||||
};
|
||||
|
||||
hip_check_error(hipExtStreamGetCUMask(stream_config.stream_id_, MAX_MASK_DWORDS, &cuMask[0]));
|
||||
|
||||
int ret = 0;
|
||||
|
||||
for(int i = 0; i < MAX_MASK_DWORDS; i++)
|
||||
ret += countSetBits(cuMask[i]);
|
||||
|
||||
return (ret);
|
||||
};
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/stream_utility.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -171,10 +172,7 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
inStridesArray_(inStridesArray),
|
||||
outStridesArray_(outStridesArray),
|
||||
elementwise_op_(elementwise_op),
|
||||
blockSize_(256),
|
||||
gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
|
||||
num_threads_m_((gridSize_ * blockSize_) / 16),
|
||||
num_threads_n_(16)
|
||||
blockSize_(256)
|
||||
{
|
||||
static_assert(NumDim_m > 0, "");
|
||||
static_assert(NumDim_n > 0, "");
|
||||
@@ -192,34 +190,10 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
return static_cast<DataType*>(out_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
in_grid_2d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MN(lengths,
|
||||
inStridesArray[I.value],
|
||||
gridSize_,
|
||||
blockSize_,
|
||||
num_threads_m_,
|
||||
num_threads_n_);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
out_grid_2d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MN(lengths,
|
||||
outStridesArray[I.value],
|
||||
gridSize_,
|
||||
blockSize_,
|
||||
num_threads_m_,
|
||||
num_threads_n_);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
}
|
||||
|
||||
InDataTypePointerTuple in_dev_buffers_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
InGrid2dDescTuple in_grid_2d_desc_tuple_;
|
||||
OutGrid2dDescTuple out_grid_2d_desc_tuple_;
|
||||
|
||||
std::array<index_t, NumDim> lengths_;
|
||||
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
|
||||
@@ -227,15 +201,38 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
|
||||
ElementwiseOperation elementwise_op_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
index_t num_threads_m_;
|
||||
index_t num_threads_n_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
index_t gridSize = getAvailableComputeUnitCount(stream_config);
|
||||
index_t num_threads_m = (gridSize * arg.blockSize_) / 16;
|
||||
index_t num_threads_n = 16;
|
||||
|
||||
auto in_grid_2d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MN(arg.lengths_,
|
||||
arg.inStridesArray_[I.value],
|
||||
gridSize,
|
||||
arg.blockSize_,
|
||||
num_threads_m,
|
||||
num_threads_n);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_grid_2d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_MN(arg.lengths_,
|
||||
arg.outStridesArray_[I.value],
|
||||
gridSize,
|
||||
arg.blockSize_,
|
||||
num_threads_m,
|
||||
num_threads_n);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
const auto kernel = kernel_elementwise_2d<GridwiseElementwise,
|
||||
InGrid2dDescTuple,
|
||||
OutGrid2dDescTuple,
|
||||
@@ -245,16 +242,16 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(gridSize),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.in_grid_2d_desc_tuple_,
|
||||
arg.out_grid_2d_desc_tuple_,
|
||||
in_grid_2d_desc_tuple,
|
||||
out_grid_2d_desc_tuple,
|
||||
arg.in_dev_buffers_,
|
||||
arg.out_dev_buffers_,
|
||||
arg.elementwise_op_,
|
||||
arg.num_threads_m_,
|
||||
arg.num_threads_n_);
|
||||
num_threads_m,
|
||||
num_threads_n);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/stream_utility.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -144,8 +145,7 @@ struct DeviceElementwiseImpl
|
||||
inStridesArray_(inStridesArray),
|
||||
outStridesArray_(outStridesArray),
|
||||
elementwise_op_(elementwise_op),
|
||||
blockSize_(256),
|
||||
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
|
||||
blockSize_(256)
|
||||
{
|
||||
in_dev_buffers_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
@@ -160,26 +160,10 @@ struct DeviceElementwiseImpl
|
||||
return static_cast<DataType*>(out_dev_buffers[I.value]);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
in_grid_1d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
lengths, inStridesArray[I.value], gridSize_, blockSize_);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
out_grid_1d_desc_tuple_ = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
lengths, outStridesArray[I.value], gridSize_, blockSize_);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
}
|
||||
|
||||
InDataTypePointerTuple in_dev_buffers_;
|
||||
OutDataTypePointerTuple out_dev_buffers_;
|
||||
InGrid1dDescTuple in_grid_1d_desc_tuple_;
|
||||
OutGrid1dDescTuple out_grid_1d_desc_tuple_;
|
||||
|
||||
std::array<index_t, NumDim> lengths_;
|
||||
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
|
||||
@@ -187,13 +171,28 @@ struct DeviceElementwiseImpl
|
||||
|
||||
ElementwiseOperation elementwise_op_;
|
||||
index_t blockSize_;
|
||||
index_t gridSize_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
index_t gridSize = getAvailableComputeUnitCount(stream_config);
|
||||
|
||||
auto in_grid_1d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_);
|
||||
},
|
||||
Number<NumInput>{});
|
||||
|
||||
auto out_grid_1d_desc_tuple = generate_tuple(
|
||||
[&](auto I) {
|
||||
return MakeDescriptor_M(
|
||||
arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_);
|
||||
},
|
||||
Number<NumOutput>{});
|
||||
|
||||
const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
|
||||
InGrid1dDescTuple,
|
||||
OutGrid1dDescTuple,
|
||||
@@ -203,11 +202,11 @@ struct DeviceElementwiseImpl
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize_),
|
||||
dim3(gridSize),
|
||||
dim3(arg.blockSize_),
|
||||
0,
|
||||
arg.in_grid_1d_desc_tuple_,
|
||||
arg.out_grid_1d_desc_tuple_,
|
||||
in_grid_1d_desc_tuple,
|
||||
out_grid_1d_desc_tuple,
|
||||
arg.in_dev_buffers_,
|
||||
arg.out_dev_buffers_,
|
||||
arg.elementwise_op_);
|
||||
|
||||
Reference in New Issue
Block a user