MaxPool & AvgPool bwd instances, test, ckProfiler, client example (#861)

* Add maxpool instances

* Rename index pool to max pool.

* Add maxpool bwd bf16 instances

* Add avg pool bwd instances

* Rename avgpool and maxpool to avg_pool3d and max_pool

* Add bf16 pool fwd instances

* Add max pool bwd to ckProfiler

* Add avg pool3d bwd to ckProfiler

* Add avg pool bwd test

* Fix bug of reference pool fwd (dilation)

* Fix bug of max pool bwd  (dilation and initZero)

* Support bf16 compute data type

* Force compute type be f32. Because atomicAdd only support f32

* Add max pool bwd test

* Rename folder

* Rename pool

* Add max pool bwd client example

* Add avg pool bwd client example

* Add missing workspace

* clang format

* Rename macro

* remove useless header

* remove useless layout
This commit is contained in:
rocking
2023-08-31 21:01:50 +08:00
committed by GitHub
parent bf1912ed3d
commit 866377de18
45 changed files with 2174 additions and 40 deletions

View File

@@ -8,7 +8,7 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -25,7 +25,7 @@ template <typename DOutDataType,
typename IndexDataType,
typename DInDataType,
ck::index_t InOutVectorSize>
struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDataType, DInDataType>
struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>
{
using DInDataType_AutomicAddPreCast =
conditional_t<is_same_v<DInDataType, float> || is_same_v<DInDataType, double>,
@@ -91,7 +91,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
index_t dout_length,
index_t din_length,
const std::vector<ck::index_t>& window_lengths,
const std::vector<ck::index_t>& window_strides)
const std::vector<ck::index_t>& window_strides,
const std::vector<ck::index_t>& window_dilations)
: p_dout_{p_dout},
p_indices_{p_indices},
p_din_{p_din},
@@ -102,7 +103,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
{
for(size_t i = 0; i < window_lengths.size(); ++i)
{
windowOverlap_ |= window_lengths.at(i) > window_strides.at(i);
auto eff = (window_lengths.at(i) - 1) * window_dilations.at(i) + 1;
windowOverlap_ |= eff > window_strides.at(i);
}
}
@@ -228,6 +230,11 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
}
else
{
hip_check_error(hipMemsetAsync(arg.p_din_,
0,
arg.din_length_raw_ * sizeof(DInDataType),
stream_config.stream_id_));
const auto put_kernel = kernel_put_element_1d<GridwisePutElementSet,
InOutGrid1dDesc,
DOutDataType,
@@ -292,7 +299,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
index_t dout_length,
index_t din_length,
std::vector<ck::index_t> window_lengths,
std::vector<ck::index_t> window_strides) override
std::vector<ck::index_t> window_strides,
std::vector<ck::index_t> window_dilations) override
{
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
// physical size of the packed tensor
@@ -302,7 +310,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
dout_length,
din_length,
window_lengths,
window_strides);
window_strides,
window_dilations);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override