mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-18 20:09:25 +00:00
MaxPool & AvgPool bwd instances, test, ckProfiler, client example (#861)
* Add maxpool instances
* Rename index pool to max pool.
* Add maxpool bwd bf16 instances
* Add avg pool bwd instances
* Rename avgpool and maxpool to avg_pool3d and max_pool
* Add bf16 pool fwd instances
* Add max pool bwd to ckProfiler
* Add avg pool3d bwd to ckProfiler
* Add avg pool bwd test
* Fix bug of reference pool fwd (dilation)
* Fix bug of max pool bwd (dilation and initZero)
* Support bf16 compute data type
* Force compute type be f32. Because atomicAdd only support f32
* Add max pool bwd test
* Rename folder
* Rename pool
* Add max pool bwd client example
* Add avg pool bwd client example
* Add missing workspace
* clang format
* Rename macro
* remove useless header
* remove useless layout
[ROCm/composable_kernel commit: 866377de18]
This commit is contained in:
@@ -13,7 +13,7 @@ namespace device {
|
||||
|
||||
// For pooling which used indexable operation, such as MaxPool, MinPool...etc
|
||||
template <typename DOutDataType, typename IndexDataType, typename DInDataType>
|
||||
struct DeviceIndexPoolBwd : public BaseOperator
|
||||
struct DeviceMaxPoolBwd : public BaseOperator
|
||||
{
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_dout,
|
||||
@@ -22,7 +22,8 @@ struct DeviceIndexPoolBwd : public BaseOperator
|
||||
index_t dout_length,
|
||||
index_t din_length,
|
||||
std::vector<ck::index_t> window_lengths,
|
||||
std::vector<ck::index_t> window_strides) = 0;
|
||||
std::vector<ck::index_t> window_strides,
|
||||
std::vector<ck::index_t> window_dilations) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
@@ -25,7 +25,7 @@ template <typename DOutDataType,
|
||||
typename IndexDataType,
|
||||
typename DInDataType,
|
||||
ck::index_t InOutVectorSize>
|
||||
struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
|
||||
struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>
|
||||
{
|
||||
using DInDataType_AutomicAddPreCast =
|
||||
conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
|
||||
@@ -91,7 +91,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
|
||||
index_t dout_length,
|
||||
index_t din_length,
|
||||
const std::vector<ck::index_t>& window_lengths,
|
||||
const std::vector<ck::index_t>& window_strides)
|
||||
const std::vector<ck::index_t>& window_strides,
|
||||
const std::vector<ck::index_t>& window_dilations)
|
||||
: p_dout_{p_dout},
|
||||
p_indices_{p_indices},
|
||||
p_din_{p_din},
|
||||
@@ -102,7 +103,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
|
||||
{
|
||||
for(size_t i = 0; i < window_lengths.size(); ++i)
|
||||
{
|
||||
windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
|
||||
auto eff = (window_lengths.at(i) - 1) * window_dilations.at(i) + 1;
|
||||
windowOverlap_ |= eff > window_strides.at(i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -228,6 +230,11 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
|
||||
}
|
||||
else
|
||||
{
|
||||
hip_check_error(hipMemsetAsync(arg.p_din_,
|
||||
0,
|
||||
arg.din_length_raw_ * sizeof(DInDataType),
|
||||
stream_config.stream_id_));
|
||||
|
||||
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
|
||||
InOutGrid1dDesc,
|
||||
DOutDataType,
|
||||
@@ -292,7 +299,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
|
||||
index_t dout_length,
|
||||
index_t din_length,
|
||||
std::vector<ck::index_t> window_lengths,
|
||||
std::vector<ck::index_t> window_strides) override
|
||||
std::vector<ck::index_t> window_strides,
|
||||
std::vector<ck::index_t> window_dilations) override
|
||||
{
|
||||
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
|
||||
// physical size of the packed tensor
|
||||
@@ -302,7 +310,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
|
||||
dout_length,
|
||||
din_length,
|
||||
window_lengths,
|
||||
window_strides);
|
||||
window_strides,
|
||||
window_dilations);
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
@@ -116,7 +116,15 @@ struct Max
|
||||
template <typename T>
|
||||
__host__ __device__ static constexpr T GetIdentityValue()
|
||||
{
|
||||
return NumericLimits<T>::Lowest();
|
||||
if constexpr(is_same_v<T, bhalf_t>)
|
||||
{
|
||||
float val = NumericLimits<float>::Lowest();
|
||||
return type_convert<bhalf_t>(val);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NumericLimits<T>::Lowest();
|
||||
}
|
||||
};
|
||||
|
||||
__host__ __device__ static constexpr bool
|
||||
@@ -138,6 +146,15 @@ struct Max
|
||||
a = b;
|
||||
}
|
||||
|
||||
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
|
||||
{
|
||||
float a_ = type_convert<float>(a);
|
||||
float b_ = type_convert<float>(b);
|
||||
|
||||
if(a_ < b_)
|
||||
a = b;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
|
||||
{
|
||||
@@ -152,6 +169,18 @@ struct Max
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
|
||||
{
|
||||
float a_ = type_convert<float>(a);
|
||||
float b_ = type_convert<float>(b);
|
||||
|
||||
if(a_ < b_)
|
||||
{
|
||||
a = b;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct Min
|
||||
@@ -159,6 +188,15 @@ struct Min
|
||||
template <typename T>
|
||||
__host__ __device__ static constexpr T GetIdentityValue()
|
||||
{
|
||||
if constexpr(is_same_v<T, bhalf_t>)
|
||||
{
|
||||
float val = NumericLimits<float>::Max();
|
||||
return type_convert<bhalf_t>(val);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NumericLimits<T>::Max();
|
||||
}
|
||||
return NumericLimits<T>::Max();
|
||||
};
|
||||
|
||||
@@ -181,6 +219,15 @@ struct Min
|
||||
a = b;
|
||||
}
|
||||
|
||||
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
|
||||
{
|
||||
float a_ = type_convert<float>(a);
|
||||
float b_ = type_convert<float>(b);
|
||||
|
||||
if(a_ > b_)
|
||||
a = b;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
|
||||
{
|
||||
@@ -195,6 +242,18 @@ struct Min
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
|
||||
{
|
||||
float a_ = type_convert<float>(a);
|
||||
float b_ = type_convert<float>(b);
|
||||
|
||||
if(a_ > b_)
|
||||
{
|
||||
a = b;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct AMax
|
||||
|
||||
Reference in New Issue
Block a user