Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-15 18:42:06 +00:00)
Common forward convolution utility refactor. (#141)
* Convolution ND
* Code unification across dimensions for generating tensor descriptors.
* Example
* Instances
* Move convnd f32 instance file to comply with repo structure.
* Conv 1D tensor layouts.
* Formatting and use ReferenceConv
* Reference ConvFwd supporting 1D and 2D convolution.
* Debug printing TensorLayout name.
* Conv fwd 1D instance f32
* Refactor conv ND example.
Needed to support various conv dimensions.
* Rename conv nd example directory to prevent conflicts.
* Refactor some common utility to single file.
Plus some tests.
* Refactor GetHostTensorDescriptor + UT.
* Add 1D test case.
* Test reference convolution 1d/2d
* Remove some leftovers.
* Fix convolution example error for 1D
* Refactor test check errors utility function.
* Test Conv2D Fwd XDL
* More UT for 1D case.
* Parameterize input & weight initializers.
* Rename example to prevent conflicts.
* Split convnd instance into separate files for 1d/2d
* Address review comments.
* Fix data type for flops/gbytes calculations.
* Assign example number 11.
* 3D cases for convolution utility functions.
* 3D reference convolution.
* Add support for 3D convolution.
* Check for inputs bigger than 2GB.
* Formatting
* Support for bf16/f16/f32/i8 - conv instances + UT.
* Use check_err from test_util.hpp.
* Split convnd test into separate files for each dim.
* Fix data generation and use proper instances.
* Formatting
* Skip tensor initialization if not necessary.
* Fix CMakefiles.
* Remove redundant conv2d_fwd test.
* Lower problem size for conv3D UT.
* 3D case for convnd example.
* Remove leftovers after merge.
* Add Conv Specialization string to GetTypeString
* Skip instance causing numerical errors.
* Small fixes.
* Remove redundant includes.
* Fix namespace name error.
* Script for automatic testing and logging convolution fwd UTs
* Comment out numactl cmd.
* Refine weights initialization and relax rtol for fp16
* Move test_util.hpp to check_err.hpp
* Refine weights initialization and relax rtol for fp16
* Refactor common part of test conv utils.
* Move utility function to single common place.
* Add additional common functions to utility.
* Refactor convnd_fwd_xdl examples.
* Remove redundant files.
* Unify structure.
* Add constructor to ConvParams.
* Add input parameter validation.
* Modify conv examples to use single utility file.
* Remove check_error from host_tensor.hpp
* Get rid of check_indices function.
* Remove bf16_to_f32 function overload for scalars.
* Fix namespace.
* Add half_float::half for check_err.
* Fix conv params size in UT.
* Fix weights initialization for int8.
* Fix weights initialization for int8.
* Add type_convert when store output in ref conv 1D.
* Get back old conv2d_fwd_xdl operation.
* Silence conv debug print.
* format
* clean
* clean
* Fix merge.
* Fix namespace for check_err
* Formatting.
* Fix merge artifacts.
* Remove deleted header.
* Fix some includes and use ck::utils::check_err.
* Remove unused check_indices restored by previous merge.
* Fix namespaces after merge.
* Fix compilation error.
* Small fixes.
* Use common functions.
* Fix filename
* Fix namespaces.
* Fix merge artifact: restore function removed by accident.
* Fix ConvForwardSpecialization.
* Adhere to coding style rules.
* Fix merge artifacts.
Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: abf4bdb9a9]
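
In miniature, the refactor replaces each example's hand-rolled argument parsing, shape math, and per-example `check_error` helper with the shared `ck::utils::conv::ConvParams` and `ck::utils::check_err` utilities. The sketch below is illustrative only, assuming the `conv_fwd_util.hpp` / `check_err.hpp` headers introduced here; the `verify_output` wrapper is hypothetical, but every call it makes mirrors a call site in the diff that follows.

```cpp
// Illustrative sketch only: "verify_output" is hypothetical, but the
// ConvParams fields, GetOutputSpatialLengths(), and check_err call below
// mirror the call sites introduced in this diff.
#include <vector>

#include "check_err.hpp"     // ck::utils::check_err
#include "conv_fwd_util.hpp" // ck::utils::conv::ConvParams and helpers
#include "host_tensor.hpp"   // Tensor<>

bool verify_output(const Tensor<float>& host_output, const Tensor<float>& device_output)
{
    // Shape bookkeeping now lives in one struct instead of 15 scalars.
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 2;
    params.N                      = 128;
    params.K                      = 256;
    params.C                      = 192;
    params.filter_spatial_lengths = {3, 3};
    params.input_spatial_lengths  = {71, 71};
    params.conv_filter_strides    = {2, 2};
    params.conv_filter_dilations  = {1, 1};
    params.input_left_pads        = {1, 1};
    params.input_right_pads       = {1, 1};

    // Output spatial lengths are derived by ConvParams itself, replacing the
    // per-example ConvolutionUtility::ComputeOutputSpatialLengths calls.
    const std::vector<ck::index_t>& out_spatial = params.GetOutputSpatialLengths();
    (void)out_spatial; // would feed MakeArgument, as in the examples below

    // Host/device comparison goes through the shared helper instead of the
    // old per-example check_error.
    return ck::utils::check_err(
        host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
}
```

Because `ConvParams` owns the spatial-length vectors, the same driver code path serves the 1D/2D/3D examples unified by this change.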
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -227,7 +229,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_f32_result);
+        ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData);
     }
 
     return 0;

@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -196,7 +198,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
+        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
     }
 
     return 0;

@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -219,7 +221,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
+        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
     }
 
     return 0;

@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
@@ -244,6 +246,6 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
+        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
     }
 }

@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
@@ -230,6 +232,6 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
+        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
     }
 }

@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
@@ -248,6 +250,6 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
+        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
     }
 }

@@ -1,2 +0,0 @@
-add_example_executable(example_conv2d_fwd_xdl_fp16 conv2d_fwd_xdl_fp16.cpp)
-add_example_executable(example_conv2d_fwd_xdl_int8 conv2d_fwd_xdl_int8.cpp)

@@ -1,24 +0,0 @@
-# Instructions for ```example_conv2d_fwd_xdl```
-
-## Run ```example_conv2d_fwd_xdl```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_conv2d_fwd_xdl 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
-```

@@ -1,274 +0,0 @@
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv_fwd.hpp"
-#include "convolution_utility.hpp"
-
-using InDataType = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InLayout = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-
-using InElementOp = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
-        InDataType,     // InDataType
-        WeiDataType,    // WeiDataType
-        OutDataType,    // OutDataType
-        AccDataType,    // AccDataType
-        InElementOp,    // InElementwiseOperation
-        WeiElementOp,   // WeiElementwiseOperation
-        OutElementOp,   // OutElementwiseOperation
-        ConvFwdDefault, // ConvForwardSpecialization
-        256,            // BlockSize
-        128,            // MPerBlock
-        256,            // NPerBlock
-        4,              // K0PerBlock
-        8,              // K1
-        32,             // MPerXdl
-        32,             // NPerXdl
-        2,              // MXdlPerWave
-        4,              // NXdlPerWave
-        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
-        2,              // ABlockTransferSrcVectorDim
-        8,              // ABlockTransferSrcScalarPerVector
-        8,              // ABlockTransferDstScalarPerVector_K1
-        true,           // ABlockLdsAddExtraM
-        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
-        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
-        2,              // BBlockTransferSrcVectorDim
-        8,              // BBlockTransferSrcScalarPerVector
-        8,              // BBlockTransferDstScalarPerVector_K1
-        true,           // BBlockLdsAddExtraN
-        7,              // CThreadTransferSrcDstVectorDim
-        1>;             // CThreadTransferDstScalarPerVector
-
-using ReferenceConvFwdInstance = ck::tensor_operation::host::
-    ReferenceConvFwd<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
-
-int main(int argc, char* argv[])
-{
-    bool do_verification = 0;
-    int init_method = 0;
-    int nrepeat = 5;
-
-    // Conv shape
-    ck::index_t N = 128;
-    ck::index_t K = 256;
-    ck::index_t C = 192;
-    ck::index_t Y = 3;
-    ck::index_t X = 3;
-    ck::index_t Hi = 71;
-    ck::index_t Wi = 71;
-    ck::index_t conv_stride_h = 2;
-    ck::index_t conv_stride_w = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h = 1;
-    ck::index_t in_left_pad_w = 1;
-    ck::index_t in_right_pad_h = 1;
-    ck::index_t in_right_pad_w = 1;
-
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-    }
-    else if(argc == 19)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-
-        N = std::stoi(argv[4]);
-        K = std::stoi(argv[5]);
-        C = std::stoi(argv[6]);
-        Y = std::stoi(argv[7]);
-        X = std::stoi(argv[8]);
-        Hi = std::stoi(argv[9]);
-        Wi = std::stoi(argv[10]);
-        conv_stride_h = std::stoi(argv[11]);
-        conv_stride_w = std::stoi(argv[12]);
-        conv_dilation_h = std::stoi(argv[13]);
-        conv_dilation_w = std::stoi(argv[14]);
-        in_left_pad_h = std::stoi(argv[15]);
-        in_left_pad_w = std::stoi(argv[16]);
-        in_right_pad_h = std::stoi(argv[17]);
-        in_right_pad_w = std::stoi(argv[18]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(0);
-    }
-
-    const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-    const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-    const auto output_spatial_lengths =
-        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
-                                                                              {Y, X},
-                                                                              conv_filter_strides,
-                                                                              conv_filter_dilations,
-                                                                              input_left_pads,
-                                                                              input_right_pads);
-
-    const ck::index_t Ho = output_spatial_lengths[0];
-    const ck::index_t Wo = output_spatial_lengths[1];
-
-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::KYXC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWK>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-        }
-    };
-
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
-    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_host_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-
-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        break;
-    default:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-
-    // do GEMM
-    auto conv = DeviceConvFwdInstance{};
-    auto invoker = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      N,
-                                      K,
-                                      C,
-                                      std::vector<ck::index_t>{Hi, Wi},
-                                      std::vector<ck::index_t>{Y, X},
-                                      std::vector<ck::index_t>{Ho, Wo},
-                                      conv_filter_strides,
-                                      conv_filter_dilations,
-                                      input_left_pads,
-                                      input_right_pads,
-                                      InElementOp{},
-                                      WeiElementOp{},
-                                      OutElementOp{});
-
-    if(!conv.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device_conv with the specified compilation parameters does "
-            "not support this Conv problem");
-    }
-
-    float ave_time = invoker.Run(argument, nrepeat);
-
-    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
-
-    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                            sizeof(WeiDataType) * (K * C * Y * X) +
-                            sizeof(OutDataType) * (N * K * Ho * Wo);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    if(do_verification)
-    {
-        auto ref_conv = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
-
-        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
-                                                  wei_k_c_y_x,
-                                                  out_n_k_ho_wo_host_result,
-                                                  conv_filter_strides,
-                                                  conv_filter_dilations,
-                                                  input_left_pads,
-                                                  input_right_pads,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
-
-        ref_invoker.Run(ref_argument);
-
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
-    }
-}

@@ -1,275 +0,0 @@
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_conv_fwd.hpp"
-#include "convolution_utility.hpp"
-
-using InDataType = int8_t;
-using WeiDataType = int8_t;
-using OutDataType = int8_t;
-using AccDataType = int32_t;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InLayout = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-
-using InElementOp = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
-        int8_t,         // InDataType
-        int8_t,         // WeiDataType
-        int8_t,         // OutDataType
-        int32_t,        // AccDataType
-        PassThrough,    // InElementwiseOperation
-        PassThrough,    // WeiElementwiseOperation
-        PassThrough,    // OutElementwiseOperation
-        ConvFwdDefault, // ConvForwardSpecialization
-        256,            // BlockSize
-        128,            // MPerBlock
-        256,            // NPerBlock
-        4,              // K0PerBlock
-        16,             // K1
-        32,             // MPerXdl
-        32,             // NPerXdl
-        2,              // MXdlPerWave
-        4,              // NXdlPerWave
-        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
-        2,              // ABlockTransferSrcVectorDim
-        16,             // ABlockTransferSrcScalarPerVector
-        16,             // ABlockTransferDstScalarPerVector_K1
-        true,           // ABlockLdsAddExtraM
-        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
-        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
-        2,              // BBlockTransferSrcVectorDim
-        16,             // BBlockTransferSrcScalarPerVector
-        16,             // BBlockTransferDstScalarPerVector_K1
-        true,           // BBlockLdsAddExtraN
-        7,              // CThreadTransferSrcDstVectorDim
-        1>;             // CThreadTransferDstScalarPerVector
-
-using ReferenceConvFwdInstance = ck::tensor_operation::host::
-    ReferenceConvFwd<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
-
-int main(int argc, char* argv[])
-{
-    bool do_verification = 0;
-    int init_method = 0;
-    int nrepeat = 5;
-
-    // Conv shape
-    ck::index_t N = 128;
-    ck::index_t K = 256;
-    ck::index_t C = 192;
-    ck::index_t Y = 3;
-    ck::index_t X = 3;
-    ck::index_t Hi = 71;
-    ck::index_t Wi = 71;
-    ck::index_t conv_stride_h = 2;
-    ck::index_t conv_stride_w = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h = 1;
-    ck::index_t in_left_pad_w = 1;
-    ck::index_t in_right_pad_h = 1;
-    ck::index_t in_right_pad_w = 1;
-
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-    }
-    else if(argc == 19)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-
-        N = std::stoi(argv[4]);
-        K = std::stoi(argv[5]);
-        C = std::stoi(argv[6]);
-        Y = std::stoi(argv[7]);
-        X = std::stoi(argv[8]);
-        Hi = std::stoi(argv[9]);
-        Wi = std::stoi(argv[10]);
-        conv_stride_h = std::stoi(argv[11]);
-        conv_stride_w = std::stoi(argv[12]);
-        conv_dilation_h = std::stoi(argv[13]);
-        conv_dilation_w = std::stoi(argv[14]);
-        in_left_pad_h = std::stoi(argv[15]);
-        in_left_pad_w = std::stoi(argv[16]);
-        in_right_pad_h = std::stoi(argv[17]);
-        in_right_pad_w = std::stoi(argv[18]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(0);
-    }
-
-    const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-    const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-    const auto output_spatial_lengths =
-        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
-                                                                              {Y, X},
-                                                                              conv_filter_strides,
-                                                                              conv_filter_dilations,
-                                                                              input_left_pads,
-                                                                              input_right_pads);
-
-    const ck::index_t Ho = output_spatial_lengths[0];
-    const ck::index_t Wo = output_spatial_lengths[1];
-
-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::KYXC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWK>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-        }
-    };
-
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
-    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_host_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-
-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-1, 1});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-1, 1});
-        break;
-    default:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
-    }
-
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-
-    // do GEMM
-    auto conv = DeviceConvFwdInstance{};
-    auto invoker = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      N,
-                                      K,
-                                      C,
-                                      std::vector<ck::index_t>{Hi, Wi},
-                                      std::vector<ck::index_t>{Y, X},
-                                      std::vector<ck::index_t>{Ho, Wo},
-                                      conv_filter_strides,
-                                      conv_filter_dilations,
-                                      input_left_pads,
-                                      input_right_pads,
-                                      InElementOp{},
-                                      WeiElementOp{},
-                                      OutElementOp{});
-
-    if(!conv.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device_conv with the specified compilation parameters does "
-            "not support this Conv problem");
-    }
-
-    float ave_time = invoker.Run(argument, nrepeat);
-
-    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
-
-    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                            sizeof(WeiDataType) * (K * C * Y * X) +
-                            sizeof(OutDataType) * (N * K * Ho * Wo);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    if(do_verification)
-    {
-        auto ref_conv = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
-
-        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
-                                                  wei_k_c_y_x,
-                                                  out_n_k_ho_wo_host_result,
-                                                  conv_filter_strides,
-                                                  conv_filter_dilations,
-                                                  input_left_pads,
-                                                  input_right_pads,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
-
-        ref_invoker.Run(ref_argument);
-
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
-    }
-}

@@ -4,17 +4,20 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+
+#include "check_err.hpp"
 #include "config.hpp"
 #include "print.hpp"
+#include "conv_fwd_util.hpp"
 #include "device.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "device_tensor.hpp"
+#include "element_wise_operation.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "reference_conv_fwd_bias_activation.hpp"
-#include "convolution_utility.hpp"
 #include "tensor_layout.hpp"
+
+namespace {
 
 using InDataType = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -86,146 +89,157 @@ using ReferenceConvFwdInstance =
                      WeiElementOp,
                      OutElementOp>;
 
-int main(int argc, char* argv[])
+void PrintUseMsg()
 {
-    bool do_verification = 0;
-    int init_method = 0;
-    int nrepeat = 5;
+    std::cout << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: run kernel # of times (>1)\n"
+              << "Following arguments:\n"
+              << " N, K, C, \n"
+              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
+              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
+              << " <strides>, (ie Sy, Sx for 2D)\n"
+              << " <dilations>, (ie Dy, Dx for 2D)\n"
+              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
+              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
+              << std::endl;
+}
 
-    // Conv shape
-    ck::index_t N = 128;
-    ck::index_t K = 256;
-    ck::index_t C = 192;
-    ck::index_t Y = 3;
-    ck::index_t X = 3;
-    ck::index_t Hi = 71;
-    ck::index_t Wi = 71;
-    ck::index_t conv_stride_h = 2;
-    ck::index_t conv_stride_w = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h = 1;
-    ck::index_t in_left_pad_w = 1;
-    ck::index_t in_right_pad_h = 1;
-    ck::index_t in_right_pad_w = 1;
-
-    if(argc == 4)
+ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
 {
+    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
+    int num_dim_spatial = 2;
+    int conv_args = 3 + num_dim_spatial * 6;
+    int cmdline_nargs = conv_args + 4;
+    if(cmdline_nargs != argc)
     {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-    }
-    else if(argc == 19)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
-
-        N = std::stoi(argv[4]);
-        K = std::stoi(argv[5]);
-        C = std::stoi(argv[6]);
-        Y = std::stoi(argv[7]);
-        X = std::stoi(argv[8]);
-        Hi = std::stoi(argv[9]);
-        Wi = std::stoi(argv[10]);
-        conv_stride_h = std::stoi(argv[11]);
-        conv_stride_w = std::stoi(argv[12]);
-        conv_dilation_h = std::stoi(argv[13]);
-        conv_dilation_w = std::stoi(argv[14]);
-        in_left_pad_h = std::stoi(argv[15]);
-        in_left_pad_w = std::stoi(argv[16]);
-        in_right_pad_h = std::stoi(argv[17]);
-        in_right_pad_w = std::stoi(argv[18]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
+        PrintUseMsg();
         exit(0);
     }
 
-    const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-    const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-    const auto output_spatial_lengths =
-        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
-                                                                              {Y, X},
-                                                                              conv_filter_strides,
-                                                                              conv_filter_dilations,
-                                                                              input_left_pads,
-                                                                              input_right_pads);
+    ck::utils::conv::ConvParams params;
+    int arg_idx = 4;
 
-    const ck::index_t Ho = output_spatial_lengths[0];
-    const ck::index_t Wo = output_spatial_lengths[1];
+    params.num_dim_spatial = num_dim_spatial;
+    params.N = std::stoi(argv[arg_idx++]);
+    params.K = std::stoi(argv[arg_idx++]);
+    params.C = std::stoi(argv[arg_idx++]);
 
-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::KYXC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWK>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-        }
-    };
+    params.filter_spatial_lengths.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_spatial_lengths.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_strides.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.conv_filter_dilations.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_left_pads.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_right_pads.resize(num_dim_spatial);
+    for(int i = 0; i < num_dim_spatial; ++i)
+    {
+        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+    }
 
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
-    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_host_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    return params;
+}
 
+} // anonymous namespace
+
+int main(int argc, char* argv[])
+{
+    using namespace ck::utils::conv;
+
+    bool do_verification = 0;
+    int init_method = 0;
+    int nrepeat = 5;
+    const int num_dim_spatial = 2;
+
+    ck::utils::conv::ConvParams params;
+
+    if(argc >= 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method = std::stoi(argv[2]);
+        nrepeat = std::stoi(argv[3]);
+    }
+
+    if(argc >= 5)
+    {
+        params = ParseConvParams(argc, argv);
+    }
+
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
+                                        static_cast<std::size_t>(params.C)};
+    input_dims.insert(std::end(input_dims),
+                      std::begin(params.input_spatial_lengths),
+                      std::end(params.input_spatial_lengths));
+
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
+                                         static_cast<std::size_t>(params.C)};
+    filter_dims.insert(std::end(filter_dims),
+                       std::begin(params.filter_spatial_lengths),
+                       std::end(params.filter_spatial_lengths));
+
+    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
+                                         static_cast<std::size_t>(params.K)};
+    output_dims.insert(std::end(output_dims),
+                       std::begin(output_spatial_lengths),
+                       std::end(output_spatial_lengths));
+
+    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
+    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    Tensor<OutDataType> host_output(
+        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
+    Tensor<OutDataType> device_output(
+        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K)})));
 
-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weights: " << weights.mDesc << std::endl;
+    std::cout << "output: " << host_output.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
 
     switch(init_method)
     {
     case 0: break;
    case 1:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
         break;
     default:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
     }
 
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
 
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    bias_device_buf.ToDevice(bias_k.mData.data());
+    in_device_buf.ToDevice(input.mData.data());
+    wei_device_buf.ToDevice(weights.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
 
     auto conv = DeviceConvFwdInstance{};
     auto invoker = conv.MakeInvoker();
@@ -234,16 +248,16 @@ int main(int argc, char* argv[])
                                       static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                       static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                       static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-                                      N,
-                                      K,
-                                      C,
-                                      std::vector<ck::index_t>{Hi, Wi},
-                                      std::vector<ck::index_t>{Y, X},
-                                      std::vector<ck::index_t>{Ho, Wo},
-                                      conv_filter_strides,
-                                      conv_filter_dilations,
-                                      input_left_pads,
-                                      input_right_pads,
+                                      params.N,
+                                      params.K,
+                                      params.C,
+                                      params.input_spatial_lengths,
+                                      params.filter_spatial_lengths,
+                                      output_spatial_lengths,
+                                      params.conv_filter_strides,
+                                      params.conv_filter_dilations,
+                                      params.input_left_pads,
+                                      params.input_right_pads,
                                       InElementOp{},
                                       WeiElementOp{},
                                       OutElementOp{});
@@ -257,16 +271,19 @@ int main(int argc, char* argv[])
 
     float ave_time = invoker.Run(argument, nrepeat);
 
-    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
-
-    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                            sizeof(WeiDataType) * (K * C * Y * X) +
-                            sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    std::size_t flop = get_flops(
+        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+    std::size_t num_btype =
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
+                                                        params.C,
+                                                        params.K,
+                                                        params.input_spatial_lengths,
+                                                        params.filter_spatial_lengths,
+                                                        output_spatial_lengths) +
+        sizeof(OutDataType) * (params.K);
 
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
@@ -275,21 +292,20 @@ int main(int argc, char* argv[])
         auto ref_conv = ReferenceConvFwdInstance{};
         auto ref_invoker = ref_conv.MakeInvoker();
 
-        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
-                                                  wei_k_c_y_x,
-                                                  out_n_k_ho_wo_host_result,
-                                                  bias_k,
-                                                  conv_filter_strides,
-                                                  conv_filter_dilations,
-                                                  input_left_pads,
-                                                  input_right_pads,
+        auto ref_argument = ref_conv.MakeArgument(input,
+                                                  weights,
+                                                  host_output,
+                                                  bias,
+                                                  params.conv_filter_strides,
+                                                  params.conv_filter_dilations,
+                                                  params.input_left_pads,
+                                                  params.input_right_pads,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
         ref_invoker.Run(ref_argument);
 
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+        out_device_buf.FromDevice(device_output.mData.data());
+        ck::utils::check_err(
+            host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
     }
 }

@@ -4,17 +4,20 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "conv_fwd_util.hpp"
|
||||
#include "device.hpp"
|
||||
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
|
||||
#include "reference_conv_fwd_bias_activation_add.hpp"
|
||||
#include "convolution_utility.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
@@ -83,154 +86,166 @@ using ReferenceConvFwdInstance =
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
void PrintUseMsg()
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "Following arguments:\n"
|
||||
<< " N, K, C, \n"
|
||||
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
<< " <strides>, (ie Sy, Sx for 2D)\n"
|
||||
<< " <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Conv shape
|
||||
ck::index_t N = 128;
|
||||
ck::index_t K = 256;
|
||||
ck::index_t C = 192;
|
||||
ck::index_t Y = 3;
|
||||
ck::index_t X = 3;
|
||||
ck::index_t Hi = 71;
|
||||
ck::index_t Wi = 71;
|
||||
ck::index_t conv_stride_h = 2;
|
||||
ck::index_t conv_stride_w = 2;
|
||||
ck::index_t conv_dilation_h = 1;
|
||||
ck::index_t conv_dilation_w = 1;
|
||||
ck::index_t in_left_pad_h = 1;
|
||||
ck::index_t in_left_pad_w = 1;
|
||||
ck::index_t in_right_pad_h = 1;
|
||||
ck::index_t in_right_pad_w = 1;
|
||||
|
||||
if(argc == 4)
|
||||
ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
|
||||
{
|
||||
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
|
||||
int num_dim_spatial = 2;
|
||||
int conv_args = 3 + num_dim_spatial * 6;
|
||||
int cmdline_nargs = conv_args + 4;
|
||||
if(cmdline_nargs != argc)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 19)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
|
||||
N = std::stoi(argv[4]);
|
||||
K = std::stoi(argv[5]);
|
||||
C = std::stoi(argv[6]);
|
||||
Y = std::stoi(argv[7]);
|
||||
X = std::stoi(argv[8]);
|
||||
Hi = std::stoi(argv[9]);
|
||||
Wi = std::stoi(argv[10]);
|
||||
conv_stride_h = std::stoi(argv[11]);
|
||||
conv_stride_w = std::stoi(argv[12]);
|
||||
conv_dilation_h = std::stoi(argv[13]);
|
||||
conv_dilation_w = std::stoi(argv[14]);
|
||||
in_left_pad_h = std::stoi(argv[15]);
|
||||
in_left_pad_w = std::stoi(argv[16]);
|
||||
in_right_pad_h = std::stoi(argv[17]);
|
||||
in_right_pad_w = std::stoi(argv[18]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
PrintUseMsg();
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
|
||||
const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
|
||||
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
|
||||
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
|
||||
const auto output_spatial_lengths =
|
||||
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
|
||||
{Y, X},
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
ck::utils::conv::ConvParams params;
|
||||
int arg_idx = 4;
|
||||
|
||||
const ck::index_t Ho = output_spatial_lengths[0];
|
||||
const ck::index_t Wo = output_spatial_lengths[1];
|
||||
params.num_dim_spatial = num_dim_spatial;
|
||||
params.N = std::stoi(argv[arg_idx++]);
|
||||
params.K = std::stoi(argv[arg_idx++]);
|
||||
params.C = std::stoi(argv[arg_idx++]);
|
||||
|
||||
// tensor layout
|
||||
auto f_host_tensor_descriptor = [](std::size_t N_,
|
||||
std::size_t C_,
|
||||
std::size_t H,
|
||||
std::size_t W,
|
||||
auto layout) {
|
||||
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
|
||||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
|
||||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
|
||||
}
|
||||
else if constexpr(ck::is_same<decltype(layout),
|
||||
ck::tensor_layout::convolution::NHWC>::value ||
|
||||
ck::is_same<decltype(layout),
|
||||
ck::tensor_layout::convolution::KYXC>::value ||
|
||||
ck::is_same<decltype(layout),
|
||||
ck::tensor_layout::convolution::NHWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
|
||||
}
|
||||
};
|
||||
params.filter_spatial_lengths.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
params.input_spatial_lengths.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
params.conv_filter_strides.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
params.conv_filter_dilations.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
params.input_left_pads.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
params.input_right_pads.resize(num_dim_spatial);
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
|
||||
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
|
||||
Tensor<OutDataType> out_n_k_ho_wo_host_result(
|
||||
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
|
||||
Tensor<OutDataType> out_n_k_ho_wo_device_result(
|
||||
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
|
||||
return params;
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
const int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
|
||||
if(argc >= 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
}
|
||||
|
||||
if(argc >= 5)
|
||||
{
|
||||
params = ParseConvParams(argc, argv);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
|
||||
static_cast<std::size_t>(params.C)};
|
||||
input_dims.insert(std::end(input_dims),
|
||||
std::begin(params.input_spatial_lengths),
|
||||
std::end(params.input_spatial_lengths));
|
||||
|
||||
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
|
||||
static_cast<std::size_t>(params.C)};
|
||||
filter_dims.insert(std::end(filter_dims),
|
||||
std::begin(params.filter_spatial_lengths),
|
||||
std::end(params.filter_spatial_lengths));
|
||||
|
||||
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
|
||||
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
|
||||
static_cast<std::size_t>(params.K)};
|
||||
output_dims.insert(std::end(output_dims),
|
||||
std::begin(output_spatial_lengths),
|
||||
std::end(output_spatial_lengths));
|
||||
|
||||
    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    // bias: assume contiguous 1d vector
    Tensor<OutDataType> bias_k(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
    Tensor<OutDataType> bias(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K)})));

    // residual: assume same layout as output tensor
    Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
    Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
    std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;
    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;
    std::cout << "bias: " << bias.mDesc << std::endl;
    std::cout << "residual: " << residual.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        break;
    default:
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
        residual.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) *
                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
    DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpace());
    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
    DeviceMem resi_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpace());

    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
    bias_device_buf.ToDevice(bias_k.mData.data());
    resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());
    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());
    bias_device_buf.ToDevice(bias.mData.data());
    resi_device_buf.ToDevice(residual.mData.data());

    const auto in_element_op = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
@@ -244,16 +259,16 @@ int main(int argc, char* argv[])
        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
        static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
        static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
        N,
        K,
        C,
        std::vector<ck::index_t>{Hi, Wi},
        std::vector<ck::index_t>{Y, X},
        std::vector<ck::index_t>{Ho, Wo},
        conv_filter_strides,
        conv_filter_dilations,
        input_left_pads,
        input_right_pads,
        params.N,
        params.K,
        params.C,
        params.input_spatial_lengths,
        params.filter_spatial_lengths,
        output_spatial_lengths,
        params.conv_filter_strides,
        params.conv_filter_dilations,
        params.input_left_pads,
        params.input_right_pads,
        in_element_op,
        wei_element_op,
        out_element_op);
@@ -267,17 +282,21 @@ int main(int argc, char* argv[])

float ave_time = invoker.Run(argument, nrepeat);
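
    // FLOP count below: each of the N * K * Ho * Wo output points accumulates
    // C * Y * X multiply-adds, i.e. 2 * C * Y * X ops.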
    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
                            sizeof(WeiDataType) * (K * C * Y * X) +
                            sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K) +
                            sizeof(OutDataType) * (N * K * Ho * Wo);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    std::size_t flop = get_flops(
        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
    std::size_t num_btype =
        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
                                                        params.C,
                                                        params.K,
                                                        params.input_spatial_lengths,
                                                        params.filter_spatial_lengths,
                                                        output_spatial_lengths) +
        sizeof(OutDataType) * (params.K) +
        sizeof(OutDataType) *
(params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]);
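    // The two extra sizeof(OutDataType) terms cover the bias (K elements) and the
    // residual tensor (same extent as the output).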

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
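    // ave_time is in ms, so flop / 1.E9 / ms yields TFLOP/s and num_btype / 1.E6 / ms yields GB/s.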

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

@@ -286,23 +305,22 @@ int main(int argc, char* argv[])
    auto ref_conv = ReferenceConvFwdInstance{};
    auto ref_invoker = ref_conv.MakeInvoker();

    auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                              wei_k_c_y_x,
                                              out_n_k_ho_wo_host_result,
                                              bias_k,
                                              resi_n_k_ho_wo,
                                              conv_filter_strides,
                                              conv_filter_dilations,
                                              input_left_pads,
                                              input_right_pads,
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              host_output,
                                              bias,
                                              residual,
                                              params.conv_filter_strides,
                                              params.conv_filter_dilations,
                                              params.input_left_pads,
                                              params.input_right_pads,
                                              in_element_op,
                                              wei_element_op,
                                              out_element_op);

    ref_invoker.Run(ref_argument);

    out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

    check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
    out_device_buf.FromDevice(device_output.mData.data());
    ck::utils::check_err(
        host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
    }
}

@@ -1 +0,0 @@
add_example_executable(example_conv3d_fwd_xdl conv3d_fwd_xdl.cpp)
@@ -1,24 +0,0 @@
# Instructions for `example_conv3d_fwd_xdl`

## Run `example_conv3d_fwd_xdl`
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, LeftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx
./bin/example_conv3d_fwd_xdl 0 1 5
```
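
For reference, a full 24-argument invocation matching the defaults hard-coded in the example source below (N=4, K=256, C=192, 3x3x3 filter, 71x71x71 input, stride 2, dilation 1, padding 1) would presumably be `./bin/example_conv3d_fwd_xdl 1 1 5 4 256 192 3 3 3 71 71 71 2 2 2 1 1 1 1 1 1 1 1 1`.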

Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
```
wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1}
out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1}
num_batches_of_GEMM = 1
a_grid_desc_k0_m_k1{648, 186624, 8}
b_grid_desc_k0_n_k1{648, 256, 8}
c_grid_desc_m_n{ 186624, 256}
launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 4.58795 ms, 107.965 TFlops, 141.23 GB/s
```
@@ -1,281 +0,0 @@
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp"
#include "device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp"
#include "convolution_utility.hpp"

// convolution data type
using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

using F16 = ck::half_t;
using F32 = float;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InLayout  = ck::tensor_layout::convolution::NDHWC;
using WeiLayout = ck::tensor_layout::convolution::KZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWK;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

using DeviceConv3dFwdInstance = ck::tensor_operation::device::
    DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<
        InDataType,     // InData
        WeiDataType,    // WeiData
        OutDataType,    // OutData
        AccDataType,    // AccData
        InElementOp,    // InElementwise Operation
        WeiElementOp,   // WeiElementwise Operation
        OutElementOp,   // OutElementwise Operation
        ConvFwdDefault, // ConvForwardSpecialization
        256,            // BlockSize
        128,            // MPerBlock
        256,            // NPerBlock
        4,              // K0PerBlock
        8,              // K1. K0PerBlock * K1 = KPerBlock
        32,             // MPerXDL
        32,             // NPerXDL. Each XDL computes a matrix of size (MPerXDL, NPerBlock)
        2,              // MXdlPerWave
        4,              // NXdlPerWave
        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        8,              // ABlockTransferSrcScalarPerVector
        8,              // ABlockTransferDstScalarPerVector_K1
        true,           // ABlockLdsAddExtraM
        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        8,              // BBlockTransferSrcScalarPerVector
        8,              // BBlockTransferDstScalarPerVector_K1
        true,           // BBlockLdsAddExtraN
        7,              // CThreadTransferSrcDstVectorDim
        1>;             // CThreadTransferDstScalarPerVector

int main(int argc, char* argv[])
{
    bool do_verification = false;
    int init_method = 0;
    int nrepeat = 5;

    // convolution shape
    ck::index_t N = 4;
    ck::index_t K = 256;
    ck::index_t C = 192;
    std::vector<ck::index_t> in_spatial_lengths = {71, 71, 71};
    std::vector<ck::index_t> filter_spatial_lengths = {3, 3, 3};
    std::vector<ck::index_t> conv_filter_strides = {2, 2, 2};
    std::vector<ck::index_t> conv_filter_dilations = {1, 1, 1};
    std::vector<ck::index_t> in_left_pads = {1, 1, 1};
    std::vector<ck::index_t> in_right_pads = {1, 1, 1};

    if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
        nrepeat = std::stoi(argv[3]);
    }
    else if(argc == 25)
    {
        do_verification = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
        nrepeat = std::stoi(argv[3]);

        N = std::stoi(argv[4]);
        K = std::stoi(argv[5]);
        C = std::stoi(argv[6]);
        filter_spatial_lengths[0] = std::stoi(argv[7]);
        filter_spatial_lengths[1] = std::stoi(argv[8]);
        filter_spatial_lengths[2] = std::stoi(argv[9]);
        in_spatial_lengths[0] = std::stoi(argv[10]);
        in_spatial_lengths[1] = std::stoi(argv[11]);
        in_spatial_lengths[2] = std::stoi(argv[12]);
        conv_filter_strides[0] = std::stoi(argv[13]);
        conv_filter_strides[1] = std::stoi(argv[14]);
        conv_filter_strides[2] = std::stoi(argv[15]);
        conv_filter_dilations[0] = std::stoi(argv[16]);
        conv_filter_dilations[1] = std::stoi(argv[17]);
        conv_filter_dilations[2] = std::stoi(argv[18]);
        in_left_pads[0] = std::stoi(argv[19]);
        in_left_pads[1] = std::stoi(argv[20]);
        in_left_pads[2] = std::stoi(argv[21]);
        in_right_pads[0] = std::stoi(argv[22]);
        in_right_pads[1] = std::stoi(argv[23]);
        in_right_pads[2] = std::stoi(argv[24]);
    }
    else
    {
        printf("Usage: 3 or 24 input arguments\n");
        printf(" arg1: verification (0=no, 1=yes)\n");
        printf(" arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf(" arg3: run kernel # of times (>1)\n");
        printf(" arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, LeftPz, LeftPy, "
               "LeftPx, RightPz, RightPy, RightPx\n");
        exit(0);
    }

    auto conv3d = DeviceConv3dFwdInstance{};

    const auto out_spatial_lengths =
        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths(
            in_spatial_lengths,
            filter_spatial_lengths,
            conv_filter_strides,
            conv_filter_dilations,
            in_left_pads,
            in_right_pads);
    Tensor<InDataType> in(
        {N, in_spatial_lengths[0], in_spatial_lengths[1], in_spatial_lengths[2], C});
    Tensor<WeiDataType> wei(
        {K, filter_spatial_lengths[0], filter_spatial_lengths[1], filter_spatial_lengths[2], C});
    Tensor<OutDataType> out(
        {N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K});

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "out: " << out.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpace());

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());

    // do Convolution
    auto invoker = conv3d.MakeInvoker();
    auto argument = conv3d.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                        N,
                                        K,
                                        C,
                                        in_spatial_lengths,
                                        filter_spatial_lengths,
                                        out_spatial_lengths,
                                        conv_filter_strides,
                                        conv_filter_dilations,
                                        in_left_pads,
                                        in_right_pads,
                                        InElementOp{},
                                        WeiElementOp{},
                                        OutElementOp{});

    if(!conv3d.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv3d with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, nrepeat);

    const auto Di = in_spatial_lengths[0];
    const auto Hi = in_spatial_lengths[1];
    const auto Wi = in_spatial_lengths[2];
    const auto Do = out_spatial_lengths[0];
    const auto Ho = out_spatial_lengths[1];
    const auto Wo = out_spatial_lengths[2];
    const auto Z = filter_spatial_lengths[0];
    const auto Y = filter_spatial_lengths[1];
const auto X = filter_spatial_lengths[2];
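
    // Each of the N * K * Do * Ho * Wo outputs takes C * Z * Y * X multiply-accumulates,
    // counted as 2 ops each below.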
    std::size_t flop = std::size_t(2) * N * K * Do * Ho * Wo * C * Z * Y * X;
    std::size_t num_btype = sizeof(InDataType) * N * Di * Hi * Wi * C +
                            sizeof(WeiDataType) * K * Z * Y * X * C +
                            sizeof(OutDataType) * N * Do * Ho * Wo * K;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    out_device_buf.FromDevice(out.mData.data());

    if(do_verification)
    {
        DeviceMem out_ref_device_buf(sizeof(OutDataType) * N * Do * Ho * Wo * K);

        using DeviceConv3dFwdNaive = ck::tensor_operation::device::
            DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<
                InDataType,
                WeiDataType,
                OutDataType,
                AccDataType,
                InElementOp,
                WeiElementOp,
                OutElementOp>;
        auto conv3d_naive = DeviceConv3dFwdNaive{};
        auto invoker_naive = conv3d_naive.MakeInvoker();
        auto argument_naive = conv3d_naive.MakeArgument(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_ref_device_buf.GetDeviceBuffer()),
            N,
            K,
            C,
            in_spatial_lengths,
            filter_spatial_lengths,
            out_spatial_lengths,
            conv_filter_strides,
            conv_filter_dilations,
            in_left_pads,
            in_right_pads,
            InElementOp{},
            WeiElementOp{},
            OutElementOp{});

        if(!conv3d_naive.IsSupportedArgument(argument_naive))
        {
            throw std::runtime_error(
                "wrong! device_conv3d_naive does NOT support the specified compilation parameters");
        }
        invoker_naive.Run(argument_naive);

        Tensor<OutDataType> out_ref(
            {N, out_spatial_lengths[0], out_spatial_lengths[1], out_spatial_lengths[2], K});

        out_ref_device_buf.FromDevice(out_ref.mData.data());

        check_error(out_ref, out);
    }

    return 0;
}
@@ -1 +1,3 @@
add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)

@@ -2,8 +2,10 @@
#include <iostream>
#include <numeric>
#include <type_traits>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -13,6 +15,8 @@
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace {

using InDataType  = float;
using WeiDataType = float;
using OutDataType = float;
@@ -80,7 +84,7 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial)
DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
@@ -99,7 +103,7 @@ DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial)
    }
}

void PrintUseMsg()
void print_use_msg()
{
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
@@ -116,18 +120,18 @@ void PrintUseMsg()
              << std::endl;
}

ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* argv[])
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 5;
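    // +5 accounts for argv[0] plus the verification, init_method, nrepeat and
    // num_dim_spatial arguments parsed in main().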
    if(cmdline_nargs != argc)
    {
        PrintUseMsg();
        print_use_msg();
        exit(0);
    }

    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial = num_dim_spatial;
@@ -169,80 +173,18 @@ ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, int argc, char* a
    return params;
}

HostTensorDescriptor GetOutputHostTensorDescriptor(const std::vector<std::size_t>& dims,
                                                   int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

HostTensorDescriptor GetFiltersHostTensorDescriptor(const std::vector<std::size_t>& dims,
                                                    int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

HostTensorDescriptor GetInputHostTensorDescriptor(const std::vector<std::size_t>& dims,
                                                  int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
} // anonymous namespace

int main(int argc, char* argv[])
{
    using namespace ck::utils::conv;

    bool do_verification = false;
    int init_method = 0;
    int nrepeat = 5;
    int num_dim_spatial = 2;

    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;

    if(argc >= 5)
    {
@@ -254,7 +196,7 @@ int main(int argc, char* argv[])

    if(argc >= 6)
    {
        params = ParseConvParams(num_dim_spatial, argc, argv);
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
@@ -276,10 +218,12 @@ int main(int argc, char* argv[])
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(GetInputHostTensorDescriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(GetFiltersHostTensorDescriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial));
    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
@@ -305,7 +249,7 @@ int main(int argc, char* argv[])
    wei_device_buf.ToDevice(weights.mData.data());

    // do GEMM
    auto conv = GetConvInstance(num_dim_spatial);
    auto conv = get_conv_instance(num_dim_spatial);
    auto invoker = conv->MakeInvokerPointer();
    auto argument =
        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
@@ -334,15 +278,15 @@ int main(int argc, char* argv[])

    float ave_time = invoker->Run(argument.get(), nrepeat);

    std::size_t flop = ck::conv_util::GetFlops(
    std::size_t flop = get_flops(
        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
    std::size_t num_btype =
        ck::conv_util::GetBtype<InDataType, WeiDataType, OutDataType>(params.N,
                                                                      params.C,
                                                                      params.K,
                                                                      params.input_spatial_lengths,
                                                                      params.filter_spatial_lengths,
                                                                      output_spatial_lengths);
        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
                                                        params.C,
                                                        params.K,
                                                        params.input_spatial_lengths,
                                                        params.filter_spatial_lengths,
                                                        output_spatial_lengths);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -367,7 +311,8 @@ int main(int argc, char* argv[])

        ref_invoker.Run(ref_argument);
        out_device_buf.FromDevice(device_output.mData.data());
        check_error(host_output, device_output);
        ck::utils::check_err(
            host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
    };

switch(num_dim_spatial)

example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp (new file, 341 lines)
@@ -0,0 +1,341 @@
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace {

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

using DeviceConvFwdBasePtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
    DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,     //
        WeiDataType,    //
        OutDataType,    //
        AccDataType,    //
        InElementOp,    // Input Elementwise Operation
        WeiElementOp,   // Weights Elementwise Operation
        OutElementOp,   // Output Elementwise Operation
        ConvFwdDefault, // ConvForwardSpecialization
        NumDimSpatial,  // NumDimSpatial
        256,            // BlockSize
        128,            // MPerBlock
        256,            // NPerBlock
        4,              // K0PerBlock
        8,              // K1
        32,             // MPerXdl
        32,             // NPerXdl
        2,              // MXdlPerWave
        4,              // NXdlPerWave
        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        8,              // ABlockTransferSrcScalarPerVector
        8,              // ABlockTransferDstScalarPerVector_K1
        true,           // ABlockLdsAddExtraM
        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        8,              // BBlockTransferSrcScalarPerVector
        8,              // BBlockTransferDstScalarPerVector_K1
        true,           // BBlockLdsAddExtraN
        7,              // CThreadTransferSrcDstVectorDim
        1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                WeiDataType,
                                                                                OutDataType,
                                                                                InElementOp,
                                                                                WeiElementOp,
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 3: {
        return std::make_unique<DeviceConvNDFwdInstance<3>>();
    }
    case 2: {
        return std::make_unique<DeviceConvNDFwdInstance<2>>();
    }
    case 1: {
        return std::make_unique<DeviceConvNDFwdInstance<1>>();
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

void print_use_msg()
{
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: run kernel # of times (>1)\n"
              << "arg4: N spatial dimensions (default 2)\n"
              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
}

ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 5;
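    // +5 accounts for argv[0] plus the verification, init_method, nrepeat and
    // num_dim_spatial arguments parsed in main().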
    if(cmdline_nargs != argc)
    {
        print_use_msg();
        exit(0);
    }

    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial = num_dim_spatial;
    params.N = std::stoi(argv[arg_idx++]);
    params.K = std::stoi(argv[arg_idx++]);
    params.C = std::stoi(argv[arg_idx++]);

    params.filter_spatial_lengths.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_spatial_lengths.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_strides.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_dilations.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_left_pads.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_right_pads.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

    return params;
}

} // anonymous namespace

int main(int argc, char* argv[])
{
    using namespace ck::utils::conv;

    bool do_verification = false;
    int init_method = 0;
    int nrepeat = 5;
    int num_dim_spatial = 2;

    ck::utils::conv::ConvParams params;

    if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
        nrepeat = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);
    }

    if(argc >= 6)
    {
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths),
                      std::end(params.input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
                                         static_cast<std::size_t>(params.C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths),
                       std::end(params.filter_spatial_lengths));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
                                         static_cast<std::size_t>(params.K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
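    // GetElementSpace() presumably returns the stride-aware element footprint (not just the
    // element count), so these buffer sizes remain valid for padded/strided layouts.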

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    // do GEMM
    auto conv = get_conv_instance(num_dim_spatial);
    auto invoker = conv->MakeInvokerPointer();
    auto argument =
        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                  static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                  static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                  params.N,
                                  params.K,
                                  params.C,
                                  params.input_spatial_lengths,
                                  params.filter_spatial_lengths,
                                  output_spatial_lengths,
                                  params.conv_filter_strides,
                                  params.conv_filter_dilations,
                                  params.input_left_pads,
                                  params.input_right_pads,
                                  InElementOp{},
                                  WeiElementOp{},
                                  OutElementOp{});

    if(!conv->IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float ave_time = invoker->Run(argument.get(), nrepeat);

    std::size_t flop = get_flops(
        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
    std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
        params.N,
        params.C,
        params.K,
        params.input_spatial_lengths,
        params.filter_spatial_lengths,
output_spatial_lengths);
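    // num_btype: bytes read for the input and weight tensors plus bytes written for the output.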

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    if(do_verification)
    {
        auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
                            const auto& ref_conv) {
            auto ref_invoker = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights,
                                                      host_output,
                                                      params.conv_filter_strides,
                                                      params.conv_filter_dilations,
                                                      params.input_left_pads,
                                                      params.input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});

            ref_invoker.Run(ref_argument);
            out_device_buf.FromDevice(device_output.mData.data());
            ck::utils::check_err(
                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
};
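        // num_dim_spatial is only known at runtime, so the dispatch below picks the matching
        // compile-time reference instance.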

        switch(num_dim_spatial)
        {
        case 3: {
            auto ref_conv = ReferenceConvNDFwdInstance<3>();
            verify_f(ref_conv);
            break;
        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            verify_f(ref_conv);
            break;
        }
        case 1: {
            auto ref_conv = ReferenceConvNDFwdInstance<1>();
            verify_f(ref_conv);
            break;
        }
        default: {
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        }
        }
    }
}

example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp (new file, 343 lines)
@@ -0,0 +1,343 @@
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace {

using InDataType  = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
using AccDataType = int32_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

using DeviceConvFwdBasePtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
    DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,     //
        WeiDataType,    //
        OutDataType,    //
        AccDataType,    //
        InElementOp,    // Input Elementwise Operation
        WeiElementOp,   // Weights Elementwise Operation
        OutElementOp,   // Output Elementwise Operation
        ConvFwdDefault, // ConvForwardSpecialization
        NumDimSpatial,  // NumDimSpatial
        256,            // BlockSize
        128,            // MPerBlock
        256,            // NPerBlock
        4,              // K0PerBlock
        16,             // K1
        32,             // MPerXdl
        32,             // NPerXdl
        2,              // MXdlPerWave
        4,              // NXdlPerWave
        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        16,             // ABlockTransferSrcScalarPerVector
        16,             // ABlockTransferDstScalarPerVector_K1
        true,           // ABlockLdsAddExtraM
        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        16,             // BBlockTransferSrcScalarPerVector
        16,             // BBlockTransferDstScalarPerVector_K1
        true,           // BBlockLdsAddExtraN
        7,              // CThreadTransferSrcDstVectorDim
        1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                WeiDataType,
                                                                                OutDataType,
                                                                                InElementOp,
                                                                                WeiElementOp,
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 3: {
        return std::make_unique<DeviceConvNDFwdInstance<3>>();
    }
    case 2: {
        return std::make_unique<DeviceConvNDFwdInstance<2>>();
    }
    case 1: {
        return std::make_unique<DeviceConvNDFwdInstance<1>>();
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

void print_use_msg()
{
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: run kernel # of times (>1)\n"
              << "arg4: N spatial dimensions (default 2)\n"
              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
}

ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    int conv_args = 3 + num_dim_spatial * 6;
    int cmdline_nargs = conv_args + 5;
    if(cmdline_nargs != argc)
    {
        print_use_msg();
        exit(0);
    }

    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial = num_dim_spatial;
    params.N = std::stoi(argv[arg_idx++]);
    params.K = std::stoi(argv[arg_idx++]);
    params.C = std::stoi(argv[arg_idx++]);

    params.filter_spatial_lengths.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_spatial_lengths.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_strides.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_dilations.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_left_pads.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_right_pads.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

    return params;
}

} // anonymous namespace

int main(int argc, char* argv[])
{
    using namespace ck::utils::conv;

    bool do_verification = false;
    int init_method = 0;
    int nrepeat = 5;
    int num_dim_spatial = 2;

    ck::utils::conv::ConvParams params;

    if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
        nrepeat = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);
    }

    if(argc >= 6)
    {
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths),
                      std::end(params.input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
                                         static_cast<std::size_t>(params.C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths),
                       std::end(params.filter_spatial_lengths));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
                                         static_cast<std::size_t>(params.K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(
        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    // do GEMM
    auto conv = get_conv_instance(num_dim_spatial);
    auto invoker = conv->MakeInvokerPointer();
    auto argument =
        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                  static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                  static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                  params.N,
                                  params.K,
                                  params.C,
                                  params.input_spatial_lengths,
                                  params.filter_spatial_lengths,
                                  output_spatial_lengths,
                                  params.conv_filter_strides,
                                  params.conv_filter_dilations,
                                  params.input_left_pads,
                                  params.input_right_pads,
                                  InElementOp{},
                                  WeiElementOp{},
                                  OutElementOp{});

    if(!conv->IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float ave_time = invoker->Run(argument.get(), nrepeat);

    std::size_t flop = get_flops(
        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
    std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
        params.N,
        params.C,
        params.K,
        params.input_spatial_lengths,
        params.filter_spatial_lengths,
        output_spatial_lengths);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    if(do_verification)
    {
        auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
                            const auto& ref_conv) {
            auto ref_invoker = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights,
                                                      host_output,
                                                      params.conv_filter_strides,
                                                      params.conv_filter_dilations,
                                                      params.input_left_pads,
                                                      params.input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});

            ref_invoker.Run(ref_argument);
            out_device_buf.FromDevice(device_output.mData.data());
            ck::utils::check_err(
                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
        };

        switch(num_dim_spatial)
        {
        case 3: {
            auto ref_conv = ReferenceConvNDFwdInstance<3>();
            verify_f(ref_conv);
            break;
        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            verify_f(ref_conv);
            break;
        }
        case 1: {
            auto ref_conv = ReferenceConvNDFwdInstance<1>();
            verify_f(ref_conv);
            break;
        }
        default: {
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        }
        }
    }
}
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -247,6 +249,6 @@ int main(int argc, char* argv[])

    in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());

    check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
    ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
}
}

@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -284,6 +286,6 @@ int main(int argc, char* argv[])
        LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
            << std::endl;
    }
    check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
    ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
}
}

@@ -4,6 +4,8 @@
#include <cstdlib>
#include <getopt.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -371,12 +373,13 @@ int main(int argc, char* argv[])
    if(args.do_verification)
    {
        out_dev.FromDevice(out.mData.data());
        check_error(out_ref, out);
        ck::utils::check_err(out.mData, out_ref.mData);

        if(NeedIndices)
        {
            out_indices_dev.FromDevice(out_indices.mData.data());
            check_indices(out_indices_ref, out_indices);
            ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
        };
    };
}

@@ -3,6 +3,8 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -300,13 +302,14 @@ int main(int argc, char* argv[])

    out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

    check_error(out_n_c_ho_wo_host, out_n_c_ho_wo_device);
    ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);

    if constexpr(NeedIndices)
    {
        out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());

        // check_indices(out_indices_n_c_ho_wo_host, out_indices_n_c_ho_wo_device);
        // ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
        //                      out_indices_n_c_ho_wo_host.mData);
    };
}
}

@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -225,7 +227,7 @@ int main(int argc, char* argv[])

    ref_invoker.Run(ref_argument);

    check_error(c_m_n_host_result, c_m_n_device_result);
    ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
    }

    return 0;

@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -225,8 +227,7 @@ int main(int argc, char* argv[])
                                                  c_element_op);

        ref_invoker.Run(ref_argument);

        check_error(c_host_tensors[i], c_device_tensors[i]);
        ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
    }
}

@@ -6,7 +6,7 @@
#include <half.hpp>

#include "config.hpp"
#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -99,10 +99,10 @@ void print_use_msg()
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
}
ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial = num_dim_spatial;
@@ -144,72 +144,6 @@ ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
    return params;
}

HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{});
    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{});
    }
    case 1: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{});
    }

    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
|
||||
|
||||
DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
|
||||
{
|
||||
switch(num_dim_spatial)
|
||||
@@ -236,7 +170,7 @@ int main(int argc, char* argv[])
|
||||
int nrepeat = 5;
|
||||
int num_dim_spatial = 2;
|
||||
|
||||
ck::conv_util::ConvParams params;
|
||||
ck::utils::conv::ConvParams params;
|
||||
params.C = 128;
|
||||
|
||||
if(argc == 4)
|
||||
@@ -288,13 +222,13 @@ int main(int argc, char* argv[])
|
||||
std::end(output_spatial_lengths));
|
||||
|
||||
Tensor<InDataType> in_n_c_hi_wi_host_result(
|
||||
get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
|
||||
ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
|
||||
Tensor<InDataType> in_n_c_hi_wi_device_result(
|
||||
get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
|
||||
ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
|
||||
Tensor<WeiDataType> wei_k_c_y_x(
|
||||
get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
|
||||
ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
|
||||
Tensor<OutDataType> out_n_k_ho_wo(
|
||||
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
|
||||
ck::utils::conv::get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
|
||||
|
||||
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
|
||||
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
|
||||
@@ -352,15 +286,15 @@ int main(int argc, char* argv[])
|
||||
|
||||
float ave_time = invoker->Run(argument.get(), nrepeat);
|
||||
|
||||
std::size_t flop = ck::conv_util::GetFlops(
|
||||
std::size_t flop = ck::utils::conv::get_flops(
|
||||
params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
|
||||
std::size_t num_btype =
|
||||
ck::conv_util::GetBtype<InDataType, WeiDataType, OutDataType>(params.N,
|
||||
params.C,
|
||||
params.K,
|
||||
params.input_spatial_lengths,
|
||||
params.filter_spatial_lengths,
|
||||
output_spatial_lengths);
|
||||
std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
|
||||
params.N,
|
||||
params.C,
|
||||
params.K,
|
||||
params.input_spatial_lengths,
|
||||
params.filter_spatial_lengths,
|
||||
output_spatial_lengths);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
@@ -13,6 +13,7 @@ include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
@@ -29,10 +30,8 @@ add_subdirectory(01_gemm)
|
||||
add_subdirectory(02_gemm_alpha_beta)
|
||||
add_subdirectory(03_gemm_bias_relu)
|
||||
add_subdirectory(04_gemm_bias_relu_add)
|
||||
add_subdirectory(05_conv2d_fwd)
|
||||
add_subdirectory(06_conv2d_fwd_bias_relu)
|
||||
add_subdirectory(07_conv2d_fwd_bias_relu_add)
|
||||
add_subdirectory(08_conv3d_fwd)
|
||||
add_subdirectory(09_convnd_fwd)
|
||||
add_subdirectory(10_conv2d_bwd_data)
|
||||
add_subdirectory(11_conv2d_bwd_weight)
|
||||
|
||||
@@ -1,242 +0,0 @@
|
||||
#ifndef CONV_UTILS_HPP
|
||||
#define CONV_UTILS_HPP
|
||||
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace conv_util {
|
||||
|
||||
/**
|
||||
* @brief Calculate number of FLOPs for Convolution
|
||||
*
|
||||
* @param[in] N Batch size.
|
||||
* @param[in] C Number of input channels.
|
||||
* @param[in] K Number of output channels.
|
||||
* @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
|
||||
* @param[in] output_spatial_lengths Convolution output spatial dimensions
|
||||
* lengths.
|
||||
*
|
||||
* @return The number of flops.
|
||||
*/
|
||||
std::size_t GetFlops(ck::index_t N,
|
||||
ck::index_t C,
|
||||
ck::index_t K,
|
||||
const std::vector<ck::index_t>& filter_spatial_lengths,
|
||||
const std::vector<ck::index_t>& output_spatial_lengths)
|
||||
{
|
||||
// 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
|
||||
return static_cast<std::size_t>(2) * N * K *
|
||||
std::accumulate(std::begin(output_spatial_lengths),
|
||||
std::end(output_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()) *
|
||||
C *
|
||||
std::accumulate(std::begin(filter_spatial_lengths),
|
||||
std::end(filter_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Calculate number of bytes read/write by convolution algorithm.
|
||||
*
|
||||
* @param[in] N Batch size.
|
||||
* @param[in] C Number of input channels.
|
||||
* @param[in] K Number of output channels.
|
||||
* @param[in] input_spatial_lengths Input spatial dimensions lengths.
|
||||
* @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
|
||||
* @param[in] output_spatial_lengths Output spatial dimensions lengths
|
||||
*
|
||||
* @tparam InDataType Input tensor data type.
|
||||
* @tparam WeiDataType Weights tensor data type.
|
||||
* @tparam OutDataType Output tensor data type.
|
||||
*
|
||||
* @return The number of used bytes.
|
||||
*/
|
||||
template <typename InDataType = float,
|
||||
typename WeiDataType = InDataType,
|
||||
typename OutDataType = InDataType>
|
||||
std::size_t GetBtype(ck::index_t N,
|
||||
ck::index_t C,
|
||||
ck::index_t K,
|
||||
const std::vector<ck::index_t>& input_spatial_lengths,
|
||||
const std::vector<ck::index_t>& filter_spatial_lengths,
|
||||
const std::vector<ck::index_t>& output_spatial_lengths)
|
||||
{
|
||||
// sizeof(InDataType) * (N * C * <input spatial lengths product>) +
|
||||
// sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
|
||||
// sizeof(OutDataType) * (N * K * <output spatial lengths product>);
|
||||
return sizeof(InDataType) * (N * C *
|
||||
std::accumulate(std::begin(input_spatial_lengths),
|
||||
std::end(input_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>())) +
|
||||
sizeof(WeiDataType) * (K * C *
|
||||
std::accumulate(std::begin(filter_spatial_lengths),
|
||||
std::end(filter_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>())) +
|
||||
sizeof(OutDataType) * (N * K *
|
||||
std::accumulate(std::begin(output_spatial_lengths),
|
||||
std::end(output_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()));
|
||||
}
|
||||
|
||||
struct ConvParams
|
||||
{
|
||||
ConvParams()
|
||||
: num_dim_spatial(2),
|
||||
N(128),
|
||||
K(256),
|
||||
C(192),
|
||||
filter_spatial_lengths(2, 3),
|
||||
input_spatial_lengths(2, 71),
|
||||
conv_filter_strides(2, 2),
|
||||
conv_filter_dilations(2, 1),
|
||||
input_left_pads(2, 1),
|
||||
input_right_pads(2, 1)
|
||||
{
|
||||
}
|
||||
ConvParams(ck::index_t n_dim_spatial,
|
||||
ck::index_t n,
|
||||
ck::index_t k,
|
||||
ck::index_t c,
|
||||
std::vector<ck::index_t> filter_lengths,
|
||||
std::vector<ck::index_t> input_lengths,
|
||||
std::vector<ck::index_t> conv_strides,
|
||||
std::vector<ck::index_t> conv_dilations,
|
||||
std::vector<ck::index_t> left_pads,
|
||||
std::vector<ck::index_t> right_pads)
|
||||
: num_dim_spatial(n_dim_spatial),
|
||||
N(n),
|
||||
K(k),
|
||||
C(c),
|
||||
filter_spatial_lengths(filter_lengths),
|
||||
input_spatial_lengths(input_lengths),
|
||||
conv_filter_strides(conv_strides),
|
||||
conv_filter_dilations(conv_dilations),
|
||||
input_left_pads(left_pads),
|
||||
input_right_pads(right_pads)
|
||||
{
|
||||
}
|
||||
|
||||
ck::index_t num_dim_spatial;
|
||||
ck::index_t N;
|
||||
ck::index_t K;
|
||||
ck::index_t C;
|
||||
|
||||
std::vector<ck::index_t> filter_spatial_lengths;
|
||||
std::vector<ck::index_t> input_spatial_lengths;
|
||||
|
||||
std::vector<ck::index_t> conv_filter_strides;
|
||||
std::vector<ck::index_t> conv_filter_dilations;
|
||||
|
||||
std::vector<ck::index_t> input_left_pads;
|
||||
std::vector<ck::index_t> input_right_pads;
|
||||
|
||||
std::vector<ck::index_t> GetOutputSpatialLengths() const
|
||||
{
|
||||
std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
|
||||
for(ck::index_t i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
// XEff = (X - 1) * conv_dilation_w + 1;
|
||||
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
const ck::index_t idx_eff =
|
||||
(filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
|
||||
out_spatial_len[i] =
|
||||
(input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
|
||||
conv_filter_strides[i] +
|
||||
1;
|
||||
}
|
||||
return out_spatial_len;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Gets the host tensor descriptor.
|
||||
*
|
||||
* @param[in] dims The tensor dimensions lengths. Always in NCHW format.
|
||||
* @param[in] layout The tensor data layout.
|
||||
*
|
||||
* @tparam TensorLayout Layout type.
|
||||
*
|
||||
* @return The host tensor descriptor object.
|
||||
*/
|
||||
template <typename TensorLayout>
|
||||
HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dims,
|
||||
const TensorLayout& layout)
|
||||
{
|
||||
std::size_t C = dims[1];
|
||||
// 1D
|
||||
if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCW>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCX>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKW>::value)
|
||||
{
|
||||
|
||||
return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], dims[2], 1}));
|
||||
}
|
||||
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NWC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KXC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], 1, C}));
|
||||
}
|
||||
// 2D
|
||||
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCHW>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCYX>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKHW>::value)
|
||||
{
|
||||
|
||||
return HostTensorDescriptor(
|
||||
dims, std::vector<std::size_t>{C * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1});
|
||||
}
|
||||
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KYXC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
dims, std::vector<std::size_t>{C * dims[2] * dims[3], 1, dims[3] * C, C});
|
||||
}
|
||||
// 3D
|
||||
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCDHW>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCZYX>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKDHW>::value)
|
||||
{
|
||||
|
||||
return HostTensorDescriptor(dims,
|
||||
std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
|
||||
dims[2] * dims[3] * dims[4],
|
||||
dims[3] * dims[4],
|
||||
dims[4],
|
||||
1});
|
||||
}
|
||||
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KZYXC>::value ||
|
||||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
dims,
|
||||
std::vector<std::size_t>{
|
||||
C * dims[2] * dims[3] * dims[4], 1, dims[3] * dims[4] * C, dims[4] * C, C});
|
||||
}
|
||||
|
||||
std::stringstream err_msg;
|
||||
err_msg << "Unsupported data layout provided: " << layout << "!";
|
||||
throw std::runtime_error(err_msg.str());
|
||||
}
|
||||
|
||||
} // namespace conv_util
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,73 +0,0 @@
|
||||
#ifndef CONVOLUTION_UTILITY_HPP
|
||||
#define CONVOLUTION_UTILITY_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
|
||||
struct ConvolutionUtility
|
||||
{
|
||||
static std::vector<ck::index_t>
|
||||
ComputeOutputSpatialLengths(std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_strides,
|
||||
std::vector<ck::index_t> conv_dilations,
|
||||
std::vector<ck::index_t> in_left_pads,
|
||||
std::vector<ck::index_t> in_right_pads)
|
||||
{
|
||||
if(input_spatial_lengths.size() == 2)
|
||||
{
|
||||
assert(filter_spatial_lengths.size() == 2);
|
||||
assert(conv_strides.size() == 2);
|
||||
assert(conv_dilations.size() == 2);
|
||||
assert(in_left_pads.size() == 2);
|
||||
assert(in_right_pads.size() == 2);
|
||||
|
||||
const index_t YEff = (filter_spatial_lengths[0] - 1) * conv_dilations[0] + 1;
|
||||
const index_t XEff = (filter_spatial_lengths[1] - 1) * conv_dilations[1] + 1;
|
||||
|
||||
const index_t Hi = input_spatial_lengths[0];
|
||||
const index_t Wi = input_spatial_lengths[1];
|
||||
|
||||
const index_t Ho =
|
||||
(Hi + in_left_pads[0] + in_right_pads[0] - YEff) / conv_strides[0] + 1;
|
||||
const index_t Wo =
|
||||
(Wi + in_left_pads[1] + in_right_pads[1] - XEff) / conv_strides[1] + 1;
|
||||
|
||||
return {Ho, Wo};
|
||||
}
|
||||
else if(input_spatial_lengths.size() == 3)
|
||||
{
|
||||
assert(filter_spatial_lengths.size() == 3);
|
||||
assert(conv_strides.size() == 3);
|
||||
assert(conv_dilations.size() == 3);
|
||||
assert(in_left_pads.size() == 3);
|
||||
assert(in_right_pads.size() == 3);
|
||||
|
||||
const index_t ZEff = (filter_spatial_lengths[0] - 1) * conv_dilations[0] + 1;
|
||||
const index_t YEff = (filter_spatial_lengths[1] - 1) * conv_dilations[1] + 1;
|
||||
const index_t XEff = (filter_spatial_lengths[2] - 1) * conv_dilations[2] + 1;
|
||||
|
||||
const index_t Di = input_spatial_lengths[0];
|
||||
const index_t Hi = input_spatial_lengths[1];
|
||||
const index_t Wi = input_spatial_lengths[2];
|
||||
|
||||
const index_t Do =
|
||||
(Di + in_left_pads[0] + in_right_pads[0] - ZEff) / conv_strides[0] + 1;
|
||||
const index_t Ho =
|
||||
(Hi + in_left_pads[1] + in_right_pads[1] - YEff) / conv_strides[1] + 1;
|
||||
const index_t Wo =
|
||||
(Wi + in_left_pads[2] + in_right_pads[2] - XEff) / conv_strides[2] + 1;
|
||||
return {Do, Ho, Wo};
|
||||
}
|
||||
else
|
||||
{
|
||||
return {};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include "convolution_utility.hpp"
|
||||
#include "conv_fwd_util.hpp"
|
||||
#include "device.hpp"
|
||||
#include "device_conv_fwd.hpp"
|
||||
#include "common_header.hpp"
|
||||
@@ -53,36 +53,30 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
InElementwiseOperation in_element_op,
|
||||
WeiElementwiseOperation wei_element_op,
|
||||
OutElementwiseOperation out_element_op)
|
||||
: N_{N},
|
||||
K_{K},
|
||||
C_{C},
|
||||
in_spatial_lengths_{input_spatial_lengths},
|
||||
filter_spatial_lengths_{filter_spatial_lengths},
|
||||
: params_{3,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial_lengths,
|
||||
input_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads},
|
||||
out_spatial_lengths_{output_spatial_lengths},
|
||||
conv_filter_strides_{conv_filter_strides},
|
||||
conv_filter_dilations_{conv_filter_dilations},
|
||||
in_left_pads_{input_left_pads},
|
||||
in_right_pads_{input_right_pads},
|
||||
p_in_{p_in},
|
||||
p_wei_{p_wei},
|
||||
p_out_{p_out},
|
||||
in_element_op_{in_element_op},
|
||||
wei_element_op_{wei_element_op},
|
||||
out_element_op_{out_element_op}
|
||||
|
||||
{
|
||||
}
|
||||
|
||||
// private:
|
||||
index_t N_;
|
||||
index_t K_;
|
||||
index_t C_;
|
||||
std::vector<index_t> in_spatial_lengths_;
|
||||
std::vector<index_t> filter_spatial_lengths_;
|
||||
utils::conv::ConvParams params_;
|
||||
std::vector<index_t> out_spatial_lengths_;
|
||||
std::vector<index_t> conv_filter_strides_;
|
||||
std::vector<index_t> conv_filter_dilations_;
|
||||
std::vector<index_t> in_left_pads_;
|
||||
std::vector<index_t> in_right_pads_;
|
||||
|
||||
const InDataType* p_in_;
|
||||
const WeiDataType* p_wei_;
|
||||
@@ -157,13 +151,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
|
||||
static bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
std::vector<index_t> out_spatial_lengths =
|
||||
ConvolutionUtility::ComputeOutputSpatialLengths(arg.in_spatial_lengths_,
|
||||
arg.filter_spatial_lengths_,
|
||||
arg.conv_filter_strides_,
|
||||
arg.conv_filter_dilations_,
|
||||
arg.in_left_pads_,
|
||||
arg.in_right_pads_);
|
||||
std::vector<index_t> out_spatial_lengths = arg.params_.GetOutputSpatialLengths();
|
||||
|
||||
bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] &&
|
||||
out_spatial_lengths[1] == arg.out_spatial_lengths_[1] &&
|
||||
|
||||
@@ -300,9 +300,6 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
|
||||
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
|
||||
|
||||
#if 1
|
||||
// FIXME: remove
|
||||
float bf16_to_f32_(ck::bhalf_t src_val);
|
||||
|
||||
// FIXME: remove
|
||||
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
|
||||
#endif
|
||||
@@ -353,28 +350,4 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
return linf_error;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
{
|
||||
bool has_error = false;
|
||||
int error_count = 0;
|
||||
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
if(ref.mData[i] != result.mData[i])
|
||||
{
|
||||
std::cerr << std::endl
|
||||
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
|
||||
<< ", result: " << result.mData[i] << ")" << std::endl;
|
||||
has_error = true;
|
||||
error_count++;
|
||||
if(error_count == 20)
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
if(!has_error)
std::cout << std::endl << "Indices result is completely accurate!" << std::endl;
}

#endif

@@ -1,9 +1,10 @@
#ifndef TEST_UTIL_HPP
#define TEST_UTIL_HPP
#ifndef CHECK_ERR_HPP
#define CHECK_ERR_HPP

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <iomanip>
#include <iterator>
@@ -13,16 +14,17 @@

#include "data_type.hpp"

namespace test {
namespace ck {
namespace utils {

template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, ck::half_t>::value,
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
bool>::type
check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg,
double rtol = 1e-5,
double atol = 1e-8)
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5,
double atol = 1e-8)
{
if(out.size() != ref.size())
{
@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
}

template <typename T>
typename std::enable_if<std::is_same<T, ck::bhalf_t>::value || std::is_same<T, ck::half_t>::value,
bool>::type
typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg,
double rtol = 1e-5,
double atol = 1e-8)
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
@@ -77,14 +78,15 @@ check_err(const std::vector<T>& out,
}

bool res{true};
int err_count = 0;
double err = 0;
double max_err = ck::type_convert<float>(ck::NumericLimits<T>::Min());
int err_count = 0;
double err = 0;
// TODO: This is a hack. We should have proper specialization for bhalf_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
float o = ck::type_convert<float>(out[i]);
float r = ck::type_convert<float>(ref[i]);
err = std::abs(o - r);
double o = type_convert<float>(out[i]);
double r = type_convert<float>(ref[i]);
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
return res;
}

bool check_err(const std::vector<ck::half_t>& out,
const std::vector<ck::half_t>& ref,
const std::string& msg,
ck::half_t rtol = static_cast<ck::half_t>(1e-3f),
ck::half_t atol = static_cast<ck::half_t>(1e-3f))
template <typename T>
typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
bool>::type
check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<ck::half_t>::min();
double max_err = std::numeric_limits<T>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
double out_ = double(out[i]);
double ref_ = double(ref[i]);
err = std::abs(out_ - ref_);
if(err > atol + rtol * std::abs(ref_) || !std::isfinite(out_) || !std::isfinite(ref_))
double o = type_convert<float>(out[i]);
double r = type_convert<float>(ref[i]);
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< i << "]: " << out_ << "!=" << ref_ << std::endl
<< i << "]: " << o << " != " << r << std::endl
<< msg << std::endl;
}
res = false;
@@ -149,13 +154,12 @@ bool check_err(const std::vector<ck::half_t>& out,
}

template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, ck::bhalf_t>::value,
bool>::type
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg,
double = 0,
double = 0)
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
{
if(out.size() != ref.size())
{
@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
return true;
}

} // namespace test
} // namespace utils
} // namespace ck

template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
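
Taken together, the refactored header now exposes a single check_err front end in namespace ck::utils, dispatched by element type (generic floating point, bhalf_t, half_t/half_float::half, and integral), each with a defaulted message and per-type default tolerances. A short usage sketch under those signatures (the vectors here are hypothetical):

std::vector<float> a{1.f, 2.f}, b{1.f, 2.f};
std::vector<int8_t> c{1, 2}, d{1, 2};

bool ok_f32 = ck::utils::check_err(a, b);                  // fp32 defaults: rtol 1e-5, atol 1e-8
bool ok_i8  = ck::utils::check_err(c, d, "int8 mismatch"); // integral overload compares exactly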

new file: library/include/ck/library/utility/conv_fwd_util.hpp (554 lines)
@@ -0,0 +1,554 @@
#ifndef CONV_FWD_UTIL_HPP
#define CONV_FWD_UTIL_HPP

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <numeric>
#include <sstream>
#include <random>
#include <tuple>
#include <type_traits>
#include <vector>

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace utils {
namespace conv {

using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;

/**
 * @brief Calculate number of FLOPs for Convolution
 *
 * @param[in] N Batch size.
 * @param[in] C Number of input channels.
 * @param[in] K Number of output channels.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Convolution output spatial dimensions
 * lengths.
 *
 * @return The number of flops.
 */
std::size_t get_flops(ck::index_t N,
ck::index_t C,
ck::index_t K,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths)
{
// 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
return static_cast<std::size_t>(2) * N * K *
std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()) *
C *
std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
}

/**
 * @brief Calculate number of bytes read/write by convolution algorithm.
 *
 * @param[in] N Batch size.
 * @param[in] C Number of input channels.
 * @param[in] K Number of output channels.
 * @param[in] input_spatial_lengths Input spatial dimensions lengths.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Output spatial dimensions lengths
 *
 * @tparam InDataType Input tensor data type.
 * @tparam WeiDataType Weights tensor data type.
 * @tparam OutDataType Output tensor data type.
 *
 * @return The number of used bytes.
 */
template <typename InDataType = float,
typename WeiDataType = InDataType,
typename OutDataType = InDataType>
std::size_t get_btype(ck::index_t N,
ck::index_t C,
ck::index_t K,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths)
{
// sizeof(InDataType) * (N * C * <input spatial lengths product>) +
// sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
// sizeof(OutDataType) * (N * K * <output spatial lengths product>);
return sizeof(InDataType) * (N * C *
std::accumulate(std::begin(input_spatial_lengths),
std::end(input_spatial_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>())) +
sizeof(WeiDataType) * (K * C *
std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>())) +
sizeof(OutDataType) * (N * K *
std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
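
As a sanity check on the two formulas, take the default problem defined by ConvParams below (N = 128, C = 192, K = 256, 3x3 filter, 71x71 input, stride 2, dilation 1, pads 1/1, hence a 36x36 output): get_flops gives 2 * 128 * 256 * 36 * 36 * 192 * 3 * 3 ≈ 1.47e11 FLOPs, and get_btype<float> gives 4 * (128*192*71*71 + 256*192*3*3 + 128*256*36*36) ≈ 0.67 GB. A minimal sketch against the signatures above:

namespace conv = ck::utils::conv;
std::size_t flop  = conv::get_flops(128, 192, 256, {3, 3}, {36, 36});                  // ~1.47e11
std::size_t bytes = conv::get_btype<float>(128, 192, 256, {71, 71}, {3, 3}, {36, 36}); // ~6.7e8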

struct ConvParams
{
ConvParams()
: num_dim_spatial(2),
N(128),
K(256),
C(192),
filter_spatial_lengths(2, 3),
input_spatial_lengths(2, 71),
conv_filter_strides(2, 2),
conv_filter_dilations(2, 1),
input_left_pads(2, 1),
input_right_pads(2, 1)
{
}

ConvParams(ck::index_t n_dim,
ck::index_t n_batch,
ck::index_t n_out_channels,
ck::index_t n_in_channels,
const std::vector<ck::index_t>& filters_len,
const std::vector<ck::index_t>& input_len,
const std::vector<ck::index_t>& strides,
const std::vector<ck::index_t>& dilations,
const std::vector<ck::index_t>& left_pads,
const std::vector<ck::index_t>& right_pads)
: num_dim_spatial(n_dim),
N(n_batch),
K(n_out_channels),
C(n_in_channels),
filter_spatial_lengths(filters_len),
input_spatial_lengths(input_len),
conv_filter_strides(strides),
conv_filter_dilations(dilations),
input_left_pads(left_pads),
input_right_pads(right_pads)
{
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::ConvParams: "
"parameter size is different from number of declared dimensions!"));
}
}

ck::index_t num_dim_spatial;
ck::index_t N;
ck::index_t K;
ck::index_t C;

std::vector<ck::index_t> filter_spatial_lengths;
std::vector<ck::index_t> input_spatial_lengths;

std::vector<ck::index_t> conv_filter_strides;
std::vector<ck::index_t> conv_filter_dilations;

std::vector<ck::index_t> input_left_pads;
std::vector<ck::index_t> input_right_pads;

std::vector<ck::index_t> GetOutputSpatialLengths() const
{
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"));
}

std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
for(ck::index_t i = 0; i < num_dim_spatial; ++i)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::index_t idx_eff =
(filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
out_spatial_len[i] =
(input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
conv_filter_strides[i] +
1;
}
return out_spatial_len;
}
};
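
For the default-constructed parameters this evaluates, per spatial dimension, to idx_eff = (3 - 1) * 1 + 1 = 3 and (71 + 1 + 1 - 3) / 2 + 1 = 36, i.e. a 36x36 output. A minimal sketch of a custom 1D problem using the validating constructor (all values illustrative):

// num_dim_spatial = 1, N = 4, K = 16, C = 8; filter 3, input 16, stride 1, dilation 1, pads 1/1
ck::utils::conv::ConvParams params(1, 4, 16, 8, {3}, {16}, {1}, {1}, {1}, {1});
std::vector<ck::index_t> out_len = params.GetOutputSpatialLengths(); // {(16 + 1 + 1 - 3) / 1 + 1} = {16}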

/**
 * @brief Gets the host tensor descriptor.
 *
 * @param[in] dims The tensor dimensions lengths. Always in NCHW format.
 * @param[in] layout The tensor data layout.
 *
 * @tparam TensorLayout Layout type.
 *
 * @return The host tensor descriptor object.
 */
template <typename TensorLayout>
HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>& dims,
const TensorLayout& layout)
{
std::size_t C = dims[1];
// 1D
if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCW>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCX>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKW>::value)
{

return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], dims[2], 1}));
}
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NWC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KXC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NWK>::value)
{
return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], 1, C}));
}
// 2D
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCHW>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCYX>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKHW>::value)
{

return HostTensorDescriptor(
dims, std::vector<std::size_t>{C * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1});
}
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KYXC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(
dims, std::vector<std::size_t>{C * dims[2] * dims[3], 1, dims[3] * C, C});
}
// 3D
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCDHW>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KCZYX>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NKDHW>::value)
{

return HostTensorDescriptor(dims,
std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
dims[2] * dims[3] * dims[4],
dims[3] * dims[4],
dims[4],
1});
}
else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::KZYXC>::value ||
std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWK>::value)
{
return HostTensorDescriptor(
dims,
std::vector<std::size_t>{
C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C});
}

std::stringstream err_msg;
err_msg << "Unsupported data layout provided: " << layout << "!";
throw std::runtime_error(err_msg.str());
}
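
As a concrete reading of the branches above: for an NHWC input with logical lengths dims = {N, C, Hi, Wi} = {2, 8, 4, 4} (dims are always passed channel-second, per the docstring), the NHWC branch yields strides {C * Hi * Wi, 1, Wi * C, C} = {128, 1, 32, 8}. A minimal sketch:

namespace tl = ck::tensor_layout::convolution;
// dims in NCHW order; strides come back as {128, 1, 32, 8}
HostTensorDescriptor desc = ck::utils::conv::get_host_tensor_descriptor({2, 8, 4, 4}, tl::NHWC{});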

template <typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
typename InLayout = ck::tensor_layout::convolution::NHWC,
typename WeiLayout = ck::tensor_layout::convolution::KYXC,
typename OutLayout = ck::tensor_layout::convolution::NHWK>
auto get_host_tensors(const ConvParams& params, bool init = true)
{
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));

std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));

const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));

Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
Tensor<WeiDataType> weights(
ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
Tensor<OutDataType> host_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
Tensor<OutDataType> device_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));

if(init)
{
std::mt19937 gen(11939);
if constexpr(std::is_same<InDataType, uint8_t>::value)
{
std::uniform_int_distribution<> dis(-5, 5);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
else
{
std::uniform_real_distribution<> dis(0.f, 1.f);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
}

return std::make_tuple(input, weights, host_output, device_output);
}
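
This is the helper that replaces the per-example tensor setup boilerplate. A hedged sketch of the intended call pattern (C++17 structured bindings over the returned tuple):

ck::utils::conv::ConvParams params; // default 2D problem
auto [input, weights, host_output, device_output] =
    ck::utils::conv::get_host_tensors<float>(params);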

HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;

switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}

HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;

switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}

HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
int num_dim_spatial = 2)
{
namespace tl = ck::tensor_layout::convolution;

switch(num_dim_spatial)
{
case 3: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
}
case 2: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
}
case 1: {
return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
}
}

template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
void run_reference_convolution_forward(const ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
PassThrough,
PassThrough,
PassThrough,
NDim>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
output,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});

ref_invoker.Run(ref_argument);
}

template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
template <ck::index_t, typename, typename, typename>
class DeviceConvNDFwdInstance>
void run_convolution_forward(const ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

auto conv = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});

if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}

invoker.Run(argument);
out_device_buf.FromDevice(output.mData.data());
}
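
Put together, the helpers above reduce a forward-convolution example to a few lines. A minimal end-to-end sketch, assuming some DeviceConvNDFwdInstance template with the (NDim, In, Wei, Out) parameter list expected by run_convolution_forward (the instance name is hypothetical):

namespace conv = ck::utils::conv;

conv::ConvParams params; // default 2D problem
auto [input, weights, host_output, device_output] = conv::get_host_tensors<float>(params);

conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
conv::run_convolution_forward<2, float, float, float, DeviceConvNDFwdInstance>(
    params, input, weights, device_output);

bool ok = ck::utils::check_err(device_output.mData, host_output.mData);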

template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
bool run_convolution_forward_instances(const ConvParams& params,
const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output,
const Tensor<OutDataType>& host_output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

bool res{true};
for(auto& conv_ptr : conv_ptrs)
{
auto invoker = conv_ptr->MakeInvokerPointer();
auto argument = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});

if(conv_ptr->IsSupportedArgument(argument.get()))
{
float atol{1e-5f};
float rtol{1e-4f};
if constexpr(std::is_same_v<InDataType, ck::half_t>)
{
atol = 1e-4f;
rtol = 2.5e-3f;
}
invoker->Run(argument.get());
out_device_buf.FromDevice(output.mData.data());
res = res &&
ck::utils::check_err(
output.mData, host_output.mData, "Error: incorrect results!", rtol, atol);
hipGetErrorString(
hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
}
}
return res;
}

} // namespace conv
} // namespace utils
} // namespace ck

#endif
@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
}

#if 1
// FIXME: remove
float bf16_to_f32_(ck::bhalf_t src_val)
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(src_val) << 16};
return u.fp32;
}

// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{
for(int i = 0; i < src.mData.size(); ++i)
dst.mData[i] = bf16_to_f32_(src.mData[i]);
dst.mData[i] = ck::type_convert<float>(src.mData[i]);
}
#endif
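
The shift in bf16_to_f32_ works because bfloat16 is exactly the upper 16 bits of an IEEE fp32 value, so placing the stored bits in the high half of a 32-bit word reproduces the sign, exponent, and truncated mantissa; ck::type_convert now centralizes that bit manipulation for the tensor overload. An equivalent standalone sketch that avoids the union type pun:

#include <cstdint>
#include <cstring>

// bf16 -> f32: move the 16 stored bits into the top half of the f32 bit pattern.
float bf16_bits_to_f32(uint16_t bits)
{
    uint32_t u = static_cast<uint32_t>(bits) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f)); // well-defined bit cast
    return f;
}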
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(add_host, add_device);
|
||||
ck::utils::check_err(add_device.mData, add_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
layout);
|
||||
|
||||
check_error(in_host, in_device);
|
||||
ck::utils::check_err(in_device.mData, in_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
layout);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
ck::utils::check_err(out_device.mData, out_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
ck::utils::check_err(out_device.mData, out_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
check_error(max_host, max_device);
|
||||
ck::utils::check_err(out_device.mData, out_host.mData);
|
||||
ck::utils::check_err(max_device.mData, max_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -517,7 +519,7 @@ int main(int argc, char* argv[])
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
layout);
|
||||
|
||||
check_error(wei_host, wei_device);
|
||||
ck::utils::check_err(wei_device.mData, wei_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
@@ -441,7 +443,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
host_gemm(a, b, c_host, layout);
|
||||
|
||||
check_error(c_host, c_device);
|
||||
ck::utils::check_err(c_device.mData, c_host.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -15,6 +15,7 @@ include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
@@ -393,7 +394,6 @@ bool profile_batched_gemm_impl(int do_verification,
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
float err = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
|
||||
pass = pass && (err < 1E-6);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -253,7 +255,8 @@ void profile_conv_bwd_data_impl(int do_verification,
|
||||
{
|
||||
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
|
||||
|
||||
check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
|
||||
ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
|
||||
in_n_c_hi_wi_host_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -245,7 +247,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#pragma once
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -301,7 +302,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#pragma once
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -233,7 +234,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -253,7 +255,8 @@ void profile_conv_fwd_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "conv_utils.hpp"
|
||||
#include "conv_fwd_util.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
@@ -68,13 +68,13 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::siz
|
||||
switch(num_dim_spatial)
|
||||
{
|
||||
case 3: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, InLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
|
||||
}
|
||||
case 2: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, InLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
|
||||
}
|
||||
case 1: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, InLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
|
||||
}
|
||||
default: {
|
||||
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
|
||||
@@ -90,13 +90,13 @@ HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::s
|
||||
switch(num_dim_spatial)
|
||||
{
|
||||
case 3: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, WeiLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
|
||||
}
|
||||
case 2: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, WeiLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
|
||||
}
|
||||
case 1: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, WeiLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
|
||||
}
|
||||
default: {
|
||||
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
|
||||
@@ -112,13 +112,13 @@ HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::siz
|
||||
switch(num_dim_spatial)
|
||||
{
|
||||
case 3: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, OutLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
|
||||
}
|
||||
case 2: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, OutLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
|
||||
}
|
||||
case 1: {
|
||||
return ck::conv_util::GetHostTensorDescriptor(dims, OutLayout{});
|
||||
return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
|
||||
}
|
||||
default: {
|
||||
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
|
||||
@@ -413,9 +413,10 @@ bool profile_convnd_bwd_data_impl(int do_verification,
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

std::size_t flop =
ck::conv_util::GetFlops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype = ck::conv_util::GetBtype<InDataType, WeiDataType, OutDataType>(
N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
std::size_t num_btype =
ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);

float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
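The renamed get_flops presumably keeps the usual forward-convolution count of two FLOPs (one multiply plus one add) per MAC, with one MAC per output point, filter tap, and input channel. A hedged, self-contained sketch of that count; the library's own formula may differ in details such as group handling:

```cpp
// Illustrative FLOP count for forward convolution; not the library's
// get_flops implementation.
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

std::size_t conv_fwd_flops(std::size_t N, std::size_t C, std::size_t K,
                           const std::vector<std::size_t>& filter_spatial,
                           const std::vector<std::size_t>& output_spatial)
{
    auto prod = [](const std::vector<std::size_t>& v) {
        return std::accumulate(v.begin(), v.end(), std::size_t{1}, std::multiplies<>{});
    };
    // 2 FLOPs per MAC; one MAC per (output point, filter tap, input channel).
    return 2 * N * K * prod(output_spatial) * C * prod(filter_spatial);
}
```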
@@ -1,4 +1,6 @@
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -283,7 +285,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());

check_error(c_m_n_host_result, c_m_n_device_result);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);

if(do_log)
{

@@ -1,4 +1,6 @@
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -257,7 +259,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());

check_error(c_m_n_host_result, c_m_n_device_result);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);

if(do_log)
{

@@ -1,4 +1,6 @@
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -236,7 +238,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());

check_error(c_m_n_host_result, c_m_n_device_result);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);

if(do_log)
{

@@ -1,5 +1,7 @@
#pragma once
#include <iomanip>

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -470,7 +472,7 @@ void profile_gemm_impl(int do_verification,

ref_invoker.Run(ref_argument);

check_error(c_m_n_host_result, c_m_n_device_f32_result);
ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData);

if(do_log)
{
@@ -499,7 +501,7 @@ void profile_gemm_impl(int do_verification,
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);

ref_invoker.Run(ref_argument);
check_error(c_m_n_host_result, c_m_n_device_result);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);

if(do_log)
{

@@ -1,5 +1,7 @@
#pragma once
#include <iomanip>

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -283,7 +285,7 @@ void profile_grouped_gemm_impl(int do_verification,
c_element_op);

ref_invoker.Run(ref_argument);
check_error(c_m_n_host_result, c_m_n_device_results[i]);
ck::utils::check_err(c_m_n_device_results[i].mData, c_m_n_host_result.mData);

if(do_log)
{

@@ -1,4 +1,6 @@
#pragma once

#include "check_err.hpp"
#include "device_reduce.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
@@ -455,12 +457,13 @@ void profile_reduce_impl_impl(bool do_verification,
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
ck::utils::check_err(out.mData, out_ref.mData);

if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
};

if(do_log)
@@ -577,12 +580,13 @@ void profile_reduce_impl_impl(bool do_verification,
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
ck::utils::check_err(out.mData, out_ref.mData);

if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
};

if(do_log)
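Note the subtle change in the reduction hunks: the dedicated check_indices helper is gone, and ck::utils::check_err now compares the argmin/argmax index vectors as well. For integral element types an exact element-wise comparison is the natural behavior; a hedged sketch, assuming check_err's integral overload works this way and with hypothetical vector names:

```cpp
// Sketch: reduction indices are integers, so the comparison is exact
// rather than tolerance-based.
#include <vector>
#include "check_err.hpp"

bool verify_indices(const std::vector<int>& device_indices,
                    const std::vector<int>& reference_indices)
{
    return ck::utils::check_err(
        device_indices, reference_indices, "Error: incorrect reduction indices!");
}
```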
@@ -32,10 +32,10 @@ enum struct ConvOutputLayout
NKHW, // 0
NHWK, // 1
};
ck::conv_util::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx)
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx)
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;

params.num_dim_spatial = num_dim_spatial;
params.N = std::stoi(argv[arg_idx++]);
@@ -106,7 +107,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);

ck::conv_util::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);

auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) {
using InDataType = decltype(input_type);
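ConvParams is brace-initializable in the order num_dim_spatial, N, K, C, filter spatial lengths, input spatial lengths, strides, dilations, left pads, right pads; parse_conv_params fills the same fields from argv. A commented sketch of one 2D configuration taken verbatim from the tests below, assuming conv_fwd_util.hpp declares ck::utils::conv::ConvParams:

```cpp
// Field order mirrors the brace-initializations in the tests below.
#include "conv_fwd_util.hpp"

ck::utils::conv::ConvParams make_example_params()
{
    return {2,        // num_dim_spatial
            128,      // N (batch)
            256,      // K (output channels)
            256,      // C (input channels)
            {3, 3},   // filter spatial lengths (Y, X)
            {14, 14}, // input spatial lengths (Hi, Wi)
            {1, 1},   // convolution strides
            {1, 1},   // filter dilations
            {1, 1},   // input left pads
            {1, 1}};  // input right pads
}
```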
@@ -15,6 +15,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
${PROJECT_SOURCE_DIR}/test/include
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/external/include/half

@@ -1,7 +1,7 @@
#include "profile_batched_gemm_impl.hpp"

#include <iostream>

#include "profile_batched_gemm_impl.hpp"

namespace {
using ADataType = ck::half_t;
using BDataType = ck::half_t;

@@ -6,13 +6,13 @@
#include <half.hpp>
#include <vector>

#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "profile_conv_bwd_weight_impl.hpp"

int test_self()
{
bool pass = true;
std::vector<ck::conv_util::ConvParams> params;
std::vector<ck::utils::conv::ConvParams> params;

params.push_back({2, 128, 256, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
params.push_back({2, 128, 256, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
@@ -136,16 +136,16 @@ int main(int argc, char* argv[])
exit(1);
}

ck::conv_util::ConvParams param{2,
N,
K,
C,
{Y, X},
{Hi, Wi},
{conv_stride_h, conv_stride_w},
{conv_dilation_h, conv_dilation_w},
{in_left_pad_h, in_left_pad_w},
{in_right_pad_h, in_right_pad_w}};
ck::utils::conv::ConvParams param{2,
N,
K,
C,
{Y, X},
{Hi, Wi},
{conv_stride_h, conv_stride_w},
{conv_dilation_h, conv_dilation_w},
{in_left_pad_h, in_left_pad_w},
{in_right_pad_h, in_right_pad_w}};
if(data_type == 0)
{
pass = ck::profiler::profile_conv_bwd_weight_impl<2,

@@ -3,13 +3,13 @@
#include <vector>

#include "config.hpp"
#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
#include "check_err.hpp"

namespace {

bool TestConvParams_GetOutputSpatialLengths()
bool test_conv_params_get_output_spatial_lengths()
{
bool res{true};
// -------------------------- default 2D ------------------------------------
@@ -18,28 +18,28 @@ bool TestConvParams_GetOutputSpatialLengths()
// stride {2,2},
// dilations {1,1},
// padding {{1,1}, {1,1}}
ck::conv_util::ConvParams conv_params;
ck::utils::conv::ConvParams conv_params;
std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36},
"Error: ConvParams 2D default constructor.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36},
"Error: ConvParams 2D default constructor.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
conv_params.input_left_pads = std::vector<ck::index_t>{2, 2};
conv_params.input_right_pads = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37},
"Error: ConvParams 2D padding left/right {2,2}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37},
"Error: ConvParams 2D padding left/right {2,2}.");

conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
@@ -47,9 +47,10 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{23, 23},
"Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
res =
ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{23, 23},
"Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");

// -------------------------- 1D ------------------------------------
conv_params.num_dim_spatial = 1;
@@ -61,24 +62,25 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1};

out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
conv_params.input_left_pads = std::vector<ck::index_t>{2};
conv_params.input_right_pads = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37},
"Error: ConvParams 1D padding left/right {2}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{37},
"Error: ConvParams 1D padding left/right {2}.");

conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
@@ -86,9 +88,9 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{23},
"Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{23},
"Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");

// -------------------------- 3D ------------------------------------
conv_params.num_dim_spatial = 3;
@@ -100,35 +102,35 @@ bool TestConvParams_GetOutputSpatialLengths()
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{71, 71, 71},
"Error: ConvParams 3D stride {1, 1, 1}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{71, 71, 71},
"Error: ConvParams 3D stride {1, 1, 1}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
conv_params.input_left_pads = std::vector<ck::index_t>{2, 2, 2};
conv_params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37, 37},
"Error: ConvParams 3D padding left/right {2, 2, 2}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{37, 37, 37},
"Error: ConvParams 3D padding left/right {2, 2, 2}.");

conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36, 36},
"Error: ConvParams 3D dilation {2, 2, 2}.");
res = ck::utils::check_err(out_spatial_len,
std::vector<ck::index_t>{36, 36, 36},
"Error: ConvParams 3D dilation {2, 2, 2}.");

conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3, 3};
conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = test::check_err(
res = ck::utils::check_err(
out_spatial_len,
std::vector<ck::index_t>{23, 23, 23},
"Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
@@ -136,50 +138,54 @@ bool TestConvParams_GetOutputSpatialLengths()
return res;
}
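Every expected value in the test above follows from the standard output-length formula, out = (in + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1, applied per spatial dimension; this is presumably what GetOutputSpatialLengths computes. A standalone sketch checked against two of the test's cases (default input 71, filter 3):

```cpp
// Output-length formula reproduced against expected values from the test
// above; an illustration, not the ConvParams implementation.
#include <cassert>

int conv_output_length(int in, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    return (in + pad_l + pad_r - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    assert(conv_output_length(71, 3, 2, 1, 1, 1) == 36); // default case
    assert(conv_output_length(71, 3, 3, 2, 1, 1) == 23); // stride 3, dilation 2
    return 0;
}
```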
bool TestGetHostTensorDescriptor()
bool test_get_host_tensor_descriptor()
{
bool res{true};
namespace tl = ck::tensor_layout::convolution;
std::vector<std::size_t> dims{2, 3, 4, 5};
HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
res = test::check_err(
HostTensorDescriptor h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
res =
ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
res = ck::utils::check_err(
h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");

h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
res = test::check_err(
h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCHW{});
res =
ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
res = ck::utils::check_err(
h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");

dims = std::vector<std::size_t>{2, 3, 4};
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
res = test::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
res =
ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");

h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
res = test::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{});
res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
res =
ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");

dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
res = test::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
res = test::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
1, // C
3 * 5 * 6, // D
3 * 6, // H
3}, // W
"Error: wrong NDHWC dimensions strides!");
h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
res = ck::utils::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
1, // C
3 * 5 * 6, // D
3 * 6, // H
3}, // W
"Error: wrong NDHWC dimensions strides!");

h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCDHW{});
res = test::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
res = test::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
4 * 5 * 6, // C
5 * 6, // D
6, // H
1}, // W
"Error: wrong NCDHW dimensions strides!");
h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{});
res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
res = ck::utils::check_err(h.GetStrides(),
{3 * 4 * 5 * 6, // N
4 * 5 * 6, // C
5 * 6, // D
6, // H
1}, // W
"Error: wrong NCDHW dimensions strides!");

return res;
}
@@ -188,10 +194,11 @@ bool TestGetHostTensorDescriptor()

int main(void)
{
bool res = TestConvParams_GetOutputSpatialLengths();
std::cout << "TestConvParams_GetOutputSpatialLengths ..... " << (res ? "SUCCESS" : "FAILURE")
bool res = test_conv_params_get_output_spatial_lengths();
std::cout << "test_conv_params_get_output_spatial_lengths ..... "
<< (res ? "SUCCESS" : "FAILURE") << std::endl;
res = test_get_host_tensor_descriptor();
std::cout << "test_get_host_tensor_descriptor ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestGetHostTensorDescriptor();
std::cout << "TestGetHostTensorDescriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}

@@ -12,7 +12,7 @@ int main()
{
bool pass = true;
// check 1d
std::vector<ck::conv_util::ConvParams> params;
std::vector<ck::utils::conv::ConvParams> params;
params.push_back({1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
params.push_back({1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
params.push_back({1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});

@@ -5,10 +5,11 @@

#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_test_util.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
#include "check_err.hpp"

// Forward declarations for conv instances.

@@ -34,10 +35,10 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConv

namespace {

bool TestConv1DNWC()
bool test_conv1D_nwc()
{
bool res{true};
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 1;
params.N = 2;
params.K = 16;
@@ -49,30 +50,31 @@ bool TestConv1DNWC()
params.input_left_pads = std::vector<ck::index_t>{1};
params.input_right_pads = std::vector<ck::index_t>{1};

auto host_tensors = test::conv::GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
auto host_tensors =
ck::utils::conv::get_host_tensors<float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& host_output = std::get<2>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<1>(params, input, weights, host_output);
ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
test::conv::RunConv<1>(params, input, weights, device_output);
res = res &&
test::check_err(
ck::utils::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

return res;
}
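get_host_tensors returns a 4-tuple of (input, weights, host_output, device_output), consumed above with std::get. A hedged sketch of the same flow with structured bindings, assuming the headers used by this test (conv_util.hpp is the test-local header added later in this diff, which declares test::conv::RunConv):

```cpp
// Sketch: tuple element order per the std::get calls in the test above.
#include "check_err.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "tensor_layout.hpp"

bool run_1d_case(const ck::utils::conv::ConvParams& params)
{
    auto host_tensors =
        ck::utils::conv::get_host_tensors<float,
                                          float,
                                          float,
                                          ck::tensor_layout::convolution::NWC,
                                          ck::tensor_layout::convolution::KXC,
                                          ck::tensor_layout::convolution::NWK>(params);
    auto& [input, weights, host_output, device_output] = host_tensors;

    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
    test::conv::RunConv<1>(params, input, weights, device_output);
    return ck::utils::check_err(device_output.mData, host_output.mData);
}
```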
template <typename T>
bool TestConv1DNWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
bool test_conv1d_nwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
{
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 1;
params.filter_spatial_lengths = std::vector<ck::index_t>{3};
params.input_spatial_lengths = std::vector<ck::index_t>{71};
@@ -81,51 +83,52 @@ bool TestConv1DNWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
params.input_left_pads = std::vector<ck::index_t>{1};
params.input_right_pads = std::vector<ck::index_t>{1};

auto host_tensors = test::conv::GetHostTensors<T,
T,
T,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
auto host_tensors =
ck::utils::conv::get_host_tensors<T,
T,
T,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
const Tensor<T>& input = std::get<0>(host_tensors);
const Tensor<T>& weights = std::get<1>(host_tensors);
Tensor<T>& host_output = std::get<2>(host_tensors);
Tensor<T>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<1>(params, input, weights, host_output);
return test::conv::RunConvInstances<1>(
ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
return ck::utils::conv::run_convolution_forward_instances<1>(
params, conv_ptrs, input, weights, device_output, host_output);
}
bool TestConv1DNWCBF16Instances()
bool test_conv1d_nwc_bf16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv1d_fwd_instance::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
return TestConv1DNWCInstances<ck::bhalf_t>(conv_ptrs);
return test_conv1d_nwc_instances<ck::bhalf_t>(conv_ptrs);
}

bool TestConv1DNWCF16Instances()
bool test_conv1d_nwc_f16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv1d_fwd_instance::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
return TestConv1DNWCInstances<ck::half_t>(conv_ptrs);
return test_conv1d_nwc_instances<ck::half_t>(conv_ptrs);
}

bool TestConv1DNWCF32Instances()
bool test_conv1d_nwc_f32_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv1d_fwd_instance::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
return TestConv1DNWCInstances<float>(conv_ptrs);
return test_conv1d_nwc_instances<float>(conv_ptrs);
}

bool TestConv1DNWCInt8Instances()
bool test_conv1d_nwc_int8_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv1d_fwd_instance::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
return TestConv1DNWCInstances<int8_t>(conv_ptrs);
return test_conv1d_nwc_instances<int8_t>(conv_ptrs);
}

} // anonymous namespace
@@ -133,18 +136,20 @@ bool TestConv1DNWCInt8Instances()
int main()
{
bool res{true};
res = TestConv1DNWC();
std::cout << "TestConv1DNWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = test_conv1D_nwc();
std::cout << "test_conv1D_nwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

res = TestConv1DNWCBF16Instances();
res = test_conv1d_nwc_bf16_instances();
std::cout << "\nTestConv1DNWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv1DNWCF16Instances();
std::cout << "\nTestConv1DNWCF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = TestConv1DNWCF32Instances();
std::cout << "\nTestConv1DNWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = TestConv1DNWCInt8Instances();
std::cout << "\nTestConv1DNWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv1d_nwc_f16_instances();
std::cout << "\ntest_conv1d_nwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv1d_nwc_f32_instances();
std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv1d_nwc_int8_instances();
std::cout << "\ntest_conv1d_nwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;

return res ? 0 : 1;
@@ -6,10 +6,11 @@

#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_test_util.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
#include "check_err.hpp"

// Forward declarations for conv instances.
using DeviceConvFwdNoOpPtr =
@@ -36,35 +37,35 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceC

namespace {

bool TestConv2DNHWC()
bool test_conv2d_nhwc()
{
bool res{true};
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.N = 2;
params.K = 16;
params.C = 4;
params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
params.conv_filter_strides = std::vector<ck::index_t>{1, 1};

auto host_tensors = test::conv::GetHostTensors(params);
auto host_tensors = ck::utils::conv::get_host_tensors(params);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& host_output = std::get<2>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<2>(params, input, weights, host_output);
ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
test::conv::RunConv<2>(params, input, weights, device_output);
res = res &&
test::check_err(
ck::utils::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

return res;
}

template <typename T>
bool TestConv2DNHWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
{
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 2;
params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
params.input_spatial_lengths = std::vector<ck::index_t>{71, 71};
@@ -73,54 +74,55 @@ bool TestConv2DNHWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
params.input_left_pads = std::vector<ck::index_t>{1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1};

auto host_tensors = test::conv::GetHostTensors<T,
T,
T,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(params);
auto host_tensors =
ck::utils::conv::get_host_tensors<T,
T,
T,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(params);
const Tensor<T>& input = std::get<0>(host_tensors);
const Tensor<T>& weights = std::get<1>(host_tensors);
Tensor<T>& host_output = std::get<2>(host_tensors);
Tensor<T>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<2>(params, input, weights, host_output);
return test::conv::RunConvInstances<2>(
ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
return ck::utils::conv::run_convolution_forward_instances<2>(
params, conv_ptrs, input, weights, device_output, host_output);
}

bool TestConv2DNHWCBF16Instances()
bool test_conv2d_nhwc_bf16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
return TestConv2DNHWCInstances<ck::bhalf_t>(conv_ptrs);
return test_conv2d_nhwc_instances<ck::bhalf_t>(conv_ptrs);
}

bool TestConv2DNHWCF16Instances()
bool test_conv2d_nhwc_f16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
return TestConv2DNHWCInstances<ck::half_t>(conv_ptrs);
return test_conv2d_nhwc_instances<ck::half_t>(conv_ptrs);
}

bool TestConv2DNHWCF32Instances()
bool test_conv2d_nhwc_f32_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
return TestConv2DNHWCInstances<float>(conv_ptrs);
return test_conv2d_nhwc_instances<float>(conv_ptrs);
}

bool TestConv2DNHWCInt8Instances()
bool test_conv2d_nhwc_int8_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv2d_fwd_instance::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
return TestConv2DNHWCInstances<int8_t>(conv_ptrs);
return test_conv2d_nhwc_instances<int8_t>(conv_ptrs);
}

} // anonymous namespace
@@ -128,19 +130,20 @@ bool TestConv2DNHWCInt8Instances()
int main()
{
bool res{true};
res = TestConv2DNHWC();
std::cout << "TestConv2DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = test_conv2d_nhwc();
std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

res = TestConv2DNHWCBF16Instances();
std::cout << "\nTestConv2DNHWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv2d_nhwc_bf16_instances();
std::cout << "\ntest_conv2d_nhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv2DNHWCF16Instances();
std::cout << "\nTestConv2DNHWCF16Instances ....." << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = TestConv2DNHWCF32Instances();
std::cout << "\nTestConv2DNHWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv2d_nhwc_f16_instances();
std::cout << "\ntest_conv2d_nhwc_f16_instances ....." << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv2DNHWCInt8Instances();
std::cout << "\nTestConv2DNHWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv2d_nhwc_f32_instances();
std::cout << "\ntest_conv2d_nhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv2d_nhwc_int8_instances();
std::cout << "\ntest_conv2d_nhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;

return res ? 0 : 1;
@@ -6,10 +6,11 @@

#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_test_util.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
#include "check_err.hpp"

// Forward declarations for conv instances.
using DeviceConvFwdNoOpPtr =
@@ -34,10 +35,10 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<Devi

namespace {

bool TestConv3DNDHWC()
bool test_conv3d_ndhwc()
{
bool res{true};
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 3;
params.N = 2;
params.K = 16;
@@ -49,30 +50,31 @@ bool TestConv3DNDHWC()
params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

auto host_tensors = test::conv::GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params);
auto host_tensors =
ck::utils::conv::get_host_tensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& host_output = std::get<2>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<3>(params, input, weights, host_output);
ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output);
test::conv::RunConv<3>(params, input, weights, device_output);
res = res &&
test::check_err(
ck::utils::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

return res;
}

bool TestConv3DNDHWC2GBInput()
bool test_conv3d_ndhwc_2gb_input()
{
// >2GB Input
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 3;
params.N = 2;
params.K = 16;
@@ -85,12 +87,12 @@ bool TestConv3DNDHWC2GBInput()
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

auto host_tensors =
test::conv::GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
ck::utils::conv::get_host_tensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);
@@ -113,10 +115,10 @@ bool TestConv3DNDHWC2GBInput()
return false;
}

bool TestConv3DNDHWC2GBFilters()
bool test_conv3d_ndhwc_2gb_filters()
{
// >2GB Filters
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 3;
params.N = 2;
params.K = 16;
@@ -129,12 +131,12 @@ bool TestConv3DNDHWC2GBFilters()
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

auto host_tensors =
test::conv::GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
ck::utils::conv::get_host_tensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);
@@ -157,10 +159,10 @@ bool TestConv3DNDHWC2GBFilters()
return false;
}

bool TestConv3DNDHWC2GBOutput()
bool test_conv3d_ndhwc_2gb_output()
{
// >2GB Output
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.num_dim_spatial = 3;
params.N = 2;
params.K = 16;
@@ -173,12 +175,12 @@ bool TestConv3DNDHWC2GBOutput()
params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};

auto host_tensors =
test::conv::GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
ck::utils::conv::get_host_tensors<float,
float,
float,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params, false);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);
@@ -202,9 +204,9 @@ bool TestConv3DNDHWC2GBOutput()
}
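These three tests exercise the 2GB guard this PR adds for tensors whose byte size exceeds what a 32-bit signed index can address. A hedged sketch of such a size check; the library's actual guard may differ in the exact limit and in how it reports the failure:

```cpp
// Illustrative 2 GB guard; a sketch, not the library's actual check.
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

bool fits_in_2gb(const std::vector<std::size_t>& lengths, std::size_t bytes_per_element)
{
    const std::size_t elements = std::accumulate(
        lengths.begin(), lengths.end(), std::size_t{1}, std::multiplies<>{});
    return elements * bytes_per_element <= (std::size_t{1} << 31); // 2 GB
}
```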
template <typename T>
bool TestConv3DNDHWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
bool test_conv3d_ndhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
{
ck::conv_util::ConvParams params;
ck::utils::conv::ConvParams params;
params.N = 64;
params.num_dim_spatial = 3;
params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
@@ -214,52 +216,53 @@ bool TestConv3DNDHWCInstances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs
params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};

auto host_tensors = test::conv::GetHostTensors<T,
T,
T,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params);
auto host_tensors =
ck::utils::conv::get_host_tensors<T,
T,
T,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(params);
const Tensor<T>& input = std::get<0>(host_tensors);
const Tensor<T>& weights = std::get<1>(host_tensors);
Tensor<T>& host_output = std::get<2>(host_tensors);
Tensor<T>& device_output = std::get<3>(host_tensors);

test::conv::RunReferenceConv<3>(params, input, weights, host_output);
return test::conv::RunConvInstances<3>(
ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output);
return ck::utils::conv::run_convolution_forward_instances<3>(
params, conv_ptrs, input, weights, device_output, host_output);
}

bool TestConv3DNDHWCBF16Instances()
bool test_conv3d_ndhwc_bf16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv3d_fwd_instance::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
return TestConv3DNDHWCInstances<ck::bhalf_t>(conv_ptrs);
return test_conv3d_ndhwc_instances<ck::bhalf_t>(conv_ptrs);
}

bool TestConv3DNDHWCF16Instances()
bool test_conv3d_ndhwc_f16_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv3d_fwd_instance::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
return TestConv3DNDHWCInstances<ck::half_t>(conv_ptrs);
return test_conv3d_ndhwc_instances<ck::half_t>(conv_ptrs);
}

bool TestConv3DNDHWCF32Instances()
bool test_conv3d_ndhwc_f32_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv3d_fwd_instance::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
return TestConv3DNDHWCInstances<float>(conv_ptrs);
return test_conv3d_ndhwc_instances<float>(conv_ptrs);
}

bool TestConv3DNDHWCInt8Instances()
bool test_conv3d_ndhwc_int8_instances()
{
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::device::device_conv3d_fwd_instance::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
return TestConv3DNDHWCInstances<int8_t>(conv_ptrs);
return test_conv3d_ndhwc_instances<int8_t>(conv_ptrs);
}

} // anonymous namespace
@@ -267,27 +270,30 @@ bool TestConv3DNDHWCInt8Instances()
int main()
{
bool res{true};
res = TestConv3DNDHWC();
std::cout << "TestConv3DNDHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = test_conv3d_ndhwc();
std::cout << "test_conv3d_ndhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

res = TestConv3DNDHWC2GBInput();
std::cout << "\nTestConv3DNDHWC2GBInput ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = TestConv3DNDHWC2GBFilters();
std::cout << "\nTestConv3DNDHWC2GBFilters ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = TestConv3DNDHWC2GBOutput();
std::cout << "\nTestConv3DNDHWC2GBOutput ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
res = test_conv3d_ndhwc_2gb_input();
std::cout << "\ntest_conv3d_ndhwc_2gb_input ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv3d_ndhwc_2gb_filters();
std::cout << "\ntest_conv3d_ndhwc_2gb_filters ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = test_conv3d_ndhwc_2gb_output();
std::cout << "\ntest_conv3d_ndhwc_2gb_output ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;

res = TestConv3DNDHWCBF16Instances();
std::cout << "\nTestConv3DNDHWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv3d_ndhwc_bf16_instances();
std::cout << "\ntest_conv3d_ndhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv3DNDHWCF16Instances();
std::cout << "\nTestConv3DNDHWCF16Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv3d_ndhwc_f16_instances();
std::cout << "\ntest_conv3d_ndhwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv3DNDHWCF32Instances();
std::cout << "\nTestConv3DNDHWCF32Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv3d_ndhwc_f32_instances();
std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;
res = TestConv3DNDHWCInt8Instances();
std::cout << "\nTestConv3DNDHWCInt8Instances ..... " << (res ? "SUCCESS" : "FAILURE")
res = test_conv3d_ndhwc_int8_instances();
std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
<< std::endl;

return res ? 0 : 1;
test/convnd_fwd/conv_util.hpp (new file, 90 lines)
@@ -0,0 +1,90 @@
#ifndef TEST_CONV_UTIL_HPP
#define TEST_CONV_UTIL_HPP

#include <tuple>

#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "sequence.hpp"

namespace {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
InDataType, //
WeiDataType, //
OutDataType, //
InDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
SpatialDims, // SpatialDims
64, // BlockSize
16, // MPerBlock
16, // NPerBlock
4, // K0PerBlock
1, // K1
16, // MPerXDL
16, // NPerXDL
1, // MXdlPerWave
1, // NXdlPerWave
S<1, 16, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
1, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<1, 16, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
1, // BBlockTransferDstScalarPerVector_K1
true, // BBlockTransferAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
// clang-format on

} // namespace

namespace test {
namespace conv {

template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
void RunConv(const ck::utils::conv::ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
ck::utils::conv::run_convolution_forward<NDim,
InDataType,
WeiDataType,
OutDataType,
DeviceConvNDFwdInstance>(
params, input, weights, output);
}

} // namespace conv
} // namespace test

#endif
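The new header keeps only the device-side runner; tensor setup, the reference convolution, and error checking now come from conv_fwd_util.hpp. A hedged sketch of the intended call pattern, reusing the helpers declared above plus check_err.hpp:

```cpp
// Sketch of a test driving test::conv::RunConv with the default 2D problem.
bool run_default_2d_case()
{
    ck::utils::conv::ConvParams params; // default-constructed 2D problem
    auto tensors = ck::utils::conv::get_host_tensors(params);
    const Tensor<float>& input   = std::get<0>(tensors);
    const Tensor<float>& weights = std::get<1>(tensors);
    Tensor<float>& host_output   = std::get<2>(tensors);
    Tensor<float>& device_output = std::get<3>(tensors);

    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
    test::conv::RunConv<2>(params, input, weights, device_output);
    return ck::utils::check_err(device_output.mData, host_output.mData);
}
```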
@@ -19,7 +19,6 @@
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "test_util.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -19,7 +19,6 @@
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "test_util.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -19,7 +19,6 @@
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "test_util.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -1,13 +1,13 @@
#ifndef GEMM_UTILS_HPP
#define GEMM_UTILS_HPP

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"

namespace ck {
namespace gemm_util {
@@ -202,20 +202,17 @@ struct TestGemm
bool res = false;
if(std::is_same<CDataType, float>::value)
{
res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!");

res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, ck::half_t>::value)
{
res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!");

res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, int8_t>::value)
{
res = test::check_err(c_device.mData, c_host.mData, "Error: incorrect results!");

res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}

@@ -330,9 +327,8 @@ struct TestGemmBF16
bf16_to_f32_(c_device_bf16, c_device_fp32);

// Assert
bool res = test::check_err(
bool res = ck::utils::check_err(
c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);

std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;

return res;
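TestGemmBF16 widens both bf16 results to f32 before comparing and relaxes the tolerances to rtol 1e-2 / atol 1e-3, since bf16 keeps only 8 mantissa bits. A hedged sketch of that widen-then-compare step; the conversion loop stands in for the test's bf16_to_f32_ functor:

```cpp
// Sketch: compare bf16 outputs in f32 with relaxed tolerances.
#include <cstddef>
#include <vector>
#include "check_err.hpp"
#include "data_type.hpp"

bool compare_bf16(const std::vector<ck::bhalf_t>& device, const std::vector<ck::bhalf_t>& host)
{
    std::vector<float> device_f32(device.size());
    std::vector<float> host_f32(host.size());
    for(std::size_t i = 0; i < device.size(); ++i)
    {
        device_f32[i] = ck::type_convert<float>(device[i]);
        host_f32[i]   = ck::type_convert<float>(host[i]);
    }
    return ck::utils::check_err(
        device_f32, host_f32, "Error: incorrect results!", 1e-2f, 1e-3f);
}
```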
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
@@ -15,7 +17,6 @@
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "test_util.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -46,24 +47,6 @@ using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;

template <typename T>
static bool check_err(const Tensor<T>& ref, const Tensor<T>& result)
{
float max_diff = 1e-2;

for(int i = 0; i < ref.mData.size(); ++i)
{
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
std::cout << double(ref.mData[i]) << "," << double(result.mData[i]) << std::endl;
return false;
}
}

return true;
}

bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
{
int group_count = rand() % 10 + 1;
@@ -188,7 +171,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)

ref_invoker.Run(ref_argument);

bool res = check_err(c_device_tensors[i], c_host_tensors[i]);
bool res = ck::utils::check_err(c_host_tensors[i].mData, c_device_tensors[i].mData);

std::cout << "group_id: " << i << (res ? " SUCCESS" : " FAILURE") << std::endl;
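The deleted local check_err used a single absolute threshold (1e-2) and bailed at the first offending element; the shared ck::utils::check_err instead combines relative and absolute tolerances, which scales to results of any magnitude. A hedged sketch of the per-element criterion such a check presumably applies (the default tolerance values here are assumptions, not the library's):

```cpp
// Sketch of a combined rtol/atol acceptance test; the library's check_err
// may differ in defaults and reporting.
#include <cmath>

bool element_ok(float result, float reference, float rtol = 1e-5f, float atol = 1e-8f)
{
    return std::abs(result - reference) <= atol + rtol * std::abs(reference);
}
```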
@@ -1,289 +0,0 @@
#ifndef TEST_CONV_UTIL_HPP
#define TEST_CONV_UTIL_HPP

#include <algorithm>
#include <cstdlib>
#include <numeric>
#include <random>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#include <vector>

#include "config.hpp"
#include "conv_utils.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"

namespace {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
    DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,     //
        WeiDataType,    //
        OutDataType,    //
        InDataType,     //
        InElementOp,    // Input Elementwise Operation
        WeiElementOp,   // Weights Elementwise Operation
        OutElementOp,   // Output Elementwise Operation
        ConvFwdDefault, // ConvForwardSpecialization
        SpatialDims,    // SpatialDims
        64,             // BlockSize
        16,             // MPerBlock
        16,             // NPerBlock
        4,              // K0PerBlock
        1,              // K1
        16,             // MPerXDL
        16,             // NPerXDL
        1,              // MXdlPerWave
        1,              // NXdlPerWave
        S<1, 16, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        1,              // ABlockTransferSrcScalarPerVector
        1,              // ABlockTransferDstScalarPerVector_K1
        true,           // ABlockLdsAddExtraM
        S<1, 16, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        1,              // BBlockTransferSrcScalarPerVector
        1,              // BBlockTransferDstScalarPerVector_K1
        true,           // BBlockTransferAddExtraN
        7,              // CThreadTransferSrcDstVectorDim
        1>;             // CThreadTransferDstScalarPerVector
// clang-format on

} // namespace

namespace test {
namespace conv {

using DeviceConvFwdNoOpPtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough>;

template <typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float,
          typename InLayout    = ck::tensor_layout::convolution::NHWC,
          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
auto GetHostTensors(const ck::conv_util::ConvParams& params, bool init = true)
{
    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths),
                      std::end(params.input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
                                         static_cast<std::size_t>(params.C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths),
                       std::end(params.filter_spatial_lengths));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
                                         static_cast<std::size_t>(params.K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{}));
    Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{}));
    Tensor<OutDataType> host_output(
        ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
    Tensor<OutDataType> device_output(
        ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));

    if(init)
    {
        std::mt19937 gen(11939);
        if constexpr(std::is_same<InDataType, uint8_t>::value)
        {
            std::uniform_int_distribution<> dis(-5, 5);
            std::generate(
                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
            std::generate(
                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
        }
        else
        {
            std::uniform_real_distribution<> dis(0.f, 1.f);
            std::generate(
                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
            std::generate(
                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
        }
        std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
        std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
    }

    return std::make_tuple(input, weights, host_output, device_output);
}

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void RunReferenceConv(const ck::conv_util::ConvParams& params,
                      const Tensor<InDataType>& input,
                      const Tensor<WeiDataType>& weights,
                      Tensor<OutDataType>& output)
{
    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 InElementOp,
                                                                 WeiElementOp,
                                                                 OutElementOp,
                                                                 NDim>();
    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              output,
                                              params.conv_filter_strides,
                                              params.conv_filter_dilations,
                                              params.input_left_pads,
                                              params.input_right_pads,
                                              InElementOp{},
                                              WeiElementOp{},
                                              OutElementOp{});

    ref_invoker.Run(ref_argument);
}

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void RunConv(const ck::conv_util::ConvParams& params,
             const Tensor<InDataType>& input,
             const Tensor<WeiDataType>& weights,
             Tensor<OutDataType>& output)
{
    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());
    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    auto conv     = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                      params.N,
                                      params.K,
                                      params.C,
                                      params.input_spatial_lengths,
                                      params.filter_spatial_lengths,
                                      output_spatial_lengths,
                                      params.conv_filter_strides,
                                      params.conv_filter_dilations,
                                      params.input_left_pads,
                                      params.input_right_pads,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "Error! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    invoker.Run(argument);
    out_device_buf.FromDevice(output.mData.data());
}

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
bool RunConvInstances(const ck::conv_util::ConvParams& params,
                      const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
                      const Tensor<InDataType>& input,
                      const Tensor<WeiDataType>& weights,
                      Tensor<OutDataType>& output,
                      const Tensor<OutDataType>& host_output)
{
    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());
    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    bool res{true};
    for(auto& conv_ptr : conv_ptrs)
    {
        auto invoker  = conv_ptr->MakeInvokerPointer();
        auto argument = conv_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            params.N,
            params.K,
            params.C,
            params.input_spatial_lengths,
            params.filter_spatial_lengths,
            output_spatial_lengths,
            params.conv_filter_strides,
            params.conv_filter_dilations,
            params.input_left_pads,
            params.input_right_pads,
            InElementOp{},
            WeiElementOp{},
            OutElementOp{});

        if(conv_ptr->IsSupportedArgument(argument.get()))
        {
            float atol{1e-5f};
            float rtol{1e-4f};
            if constexpr(std::is_same_v<InDataType, ck::half_t>)
            {
                atol = 1e-4f;
                rtol = 2.5e-3f;
            }
            invoker->Run(argument.get());
            out_device_buf.FromDevice(output.mData.data());
            res = res &&
                  test::check_err(
                      output.mData, host_output.mData, "Error: incorrect results!", atol, rtol);
            hipGetErrorString(
                hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
        }
    }
    return res;
}

} // namespace conv
} // namespace test

#endif
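Note: both the deleted helper above and its replacement in ck::utils::conv derive the output extents through ConvParams::GetOutputSpatialLengths(). A sketch of the standard formula that computation implements per spatial dimension (the names below are illustrative, not the real helper from conv_utils.hpp):

#include <cstddef>
#include <vector>

// out = (in + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1
std::vector<int> output_spatial_lengths(const std::vector<int>& input,
                                        const std::vector<int>& filter,
                                        const std::vector<int>& strides,
                                        const std::vector<int>& dilations,
                                        const std::vector<int>& left_pads,
                                        const std::vector<int>& right_pads)
{
    std::vector<int> out(input.size());
    for(std::size_t d = 0; d < input.size(); ++d)
    {
        // span of the filter once dilation is applied
        const int eff_filter = dilations[d] * (filter[d] - 1) + 1;
        out[d] = (input[d] + left_pads[d] + right_pads[d] - eff_filter) / strides[d] + 1;
    }
    return out;
}

For instance, a 6x6 input with a 3x3 filter, unit stride and dilation, and no padding gives (6 - 3)/1 + 1 = 4 per dimension, matching the {1, 1, 4, 4} reference dims asserted in the 2D reference-convolution test further down.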
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

#include "check_err.hpp"
#include "config.hpp"
#include "magic_division.hpp"
#include "device.hpp"
@@ -54,29 +56,6 @@ __host__ void cpu_magic_number_division(uint32_t magic_multiplier,
    }
}

template <typename T>
T check_error(const std::vector<T>& ref, const std::vector<T>& result)
{
    T error     = 0;
    T max_diff  = 0;
    T ref_value = 0, result_value = 0;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        T diff = std::abs(ref[i] - result[i]);
        error += diff;

        if(max_diff < diff)
        {
            max_diff     = diff;
            ref_value    = ref[i];
            result_value = result[i];
        }
    }

    return max_diff;
}

int main(int, char*[])
{
    uint64_t num_divisor = 4096;
@@ -135,9 +114,9 @@ int main(int, char*[])
    naive_result_dev_buf.FromDevice(naive_result_host.data());
    magic_result_dev_buf.FromDevice(magic_result_host.data());

    int32_t max_diff = check_error(naive_result_host, magic_result_host);
    bool res = ck::utils::check_err(magic_result_host, naive_result_host);

    if(max_diff != 0)
    if(!res)
    {
        pass = false;
        continue;
@@ -149,9 +128,9 @@ int main(int, char*[])
                           magic_result_host2.data(),
                           num_dividend);

    max_diff = check_error(naive_result_host, magic_result_host2);
    res = ck::utils::check_err(magic_result_host2, naive_result_host);

    if(max_diff != 0)
    if(!res)
    {
        pass = false;
        continue;
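Note: a host-side sketch of the multiply-high trick this test compares against naive division. The construction below (s = smallest power of two at or above d, multiplier = floor(2^32 * (2^s - d) / d) + 1, quotient = (mulhi(x, multiplier) + x) >> s) is one classic formulation valid for dividends below 2^31; it is an assumption here, and the constants actually used on device come from magic_division.hpp.

#include <cassert>
#include <cstdint>

struct Magic
{
    uint32_t multiplier;
    uint32_t shift;
};

// Assumed construction: s = ceil(log2(divisor)),
// multiplier = floor(2^32 * (2^s - divisor) / divisor) + 1.
Magic make_magic(uint32_t divisor)
{
    uint32_t shift = 0;
    while((1u << shift) < divisor)
        ++shift;
    const uint64_t one = 1;
    const uint64_t mul = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
    return {static_cast<uint32_t>(mul), shift};
}

// Replaces x / d with a multiply-high, an add, and a shift -- the pattern the
// naive-vs-magic comparison in this test exercises on device.
uint32_t magic_divide(uint32_t x, Magic m)
{
    const uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(x) * m.multiplier) >> 32);
    return (hi + x) >> m.shift;
}

int main()
{
    for(uint32_t d = 1; d <= 4096; ++d)
    {
        const Magic m = make_magic(d);
        for(uint32_t x : {0u, 1u, d - 1, d, d + 1, 123456u})
            assert(magic_divide(x, m) == x / d); // spot-check against plain division
    }
    return 0;
}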
@@ -1,10 +1,11 @@
#include "getopt.h"

#include "check_err.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduction.hpp"
#include "test_util.hpp"
#include "reduce_util.hpp"

using namespace ck;
@@ -289,13 +290,13 @@ bool test_reduce_no_index_impl(int init_method,
    {
        reduce_util::to_f32_vector(out, out_fp32);
        reduce_util::to_f32_vector(out_ref, out_ref_fp32);
        single_result = test::check_err(
        single_result = ck::utils::check_err(
            out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
    }
    else
    {
        single_result =
            test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
            ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
    };

    if(!single_result)
@@ -376,13 +377,13 @@ bool test_reduce_no_index_impl(int init_method,
    {
        reduce_util::to_f32_vector(out, out_fp32);
        reduce_util::to_f32_vector(out_ref, out_ref_fp32);
        single_result = test::check_err(
        single_result = ck::utils::check_err(
            out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
    }
    else
    {
        single_result =
            test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
            ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
    };

    if(!single_result)
@@ -4,7 +4,7 @@
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduction.hpp"
#include "test_util.hpp"
#include "check_err.hpp"
#include "reduce_util.hpp"

using namespace ck;
@@ -273,21 +273,21 @@ bool test_reduce_with_index_impl(int init_method,
    {
        reduce_util::to_f32_vector(out, out_fp32);
        reduce_util::to_f32_vector(out_ref, out_ref_fp32);
        single_result = test::check_err(
        single_result = ck::utils::check_err(
            out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
    }
    else
    {
        single_result =
            test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
            ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
    };

    if(NeedIndices)
    {
        out_indices_dev.FromDevice(out_indices.mData.data());
        single_result = single_result && test::check_err(out_indices_ref.mData,
                                                         out_indices.mData,
                                                         "Error: incorrect index result!");
        single_result = single_result && ck::utils::check_err(out_indices_ref.mData,
                                                              out_indices.mData,
                                                              "Error: incorrect index result!");
    };

    if(!single_result)
@@ -370,21 +370,22 @@ bool test_reduce_with_index_impl(int init_method,
    {
        reduce_util::to_f32_vector(out, out_fp32);
        reduce_util::to_f32_vector(out_ref, out_ref_fp32);
        single_result = test::check_err(
        single_result = ck::utils::check_err(
            out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
    }
    else
    {
        single_result =
            test::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
            ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
    };

    if(NeedIndices)
    {
        out_indices_dev.FromDevice(out_indices.mData.data());
        single_result = single_result && test::check_err(out_indices_ref.mData,
                                                         out_indices.mData,
                                                         "Error: incorrect index result!");
        single_result =
            single_result && ck::utils::check_err(out_indices_ref.mData,
                                                  out_indices.mData,
                                                  "Error: incorrect index result!");
    };

    if(!single_result)
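Note: the fp32 detour in both reduce tests exists because half and bfloat16 results are widened before comparison rather than compared natively. A stand-in sketch of the idea behind reduce_util::to_f32_vector (the real helper operates on the repository's Tensor type; this one uses plain vectors):

#include <cstddef>
#include <vector>

// Widen a low-precision result to fp32 so ck::utils::check_err can compare
// it against the reference using ordinary float tolerances.
template <typename T>
std::vector<float> to_f32_vector(const std::vector<T>& in)
{
    std::vector<float> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
        out[i] = static_cast<float>(in[i]);
    return out;
}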
@@ -6,13 +6,13 @@
#include <type_traits>
#include <vector>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_utils.hpp"
#include "conv_fwd_util.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"

namespace {
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -57,9 +57,10 @@ template <ck::index_t NDim,
          typename OutLayout     = ck::tensor_layout::convolution::NHWK,
          typename FillInputOp   = FillMonotonicSeq<InDataType>,
          typename FillWeightsOp = FillConstant<WeiDataType>>
Tensor<OutDataType> RunReferenceConv(const ck::conv_util::ConvParams& params,
                                     const FillInputOp& fill_input_op = FillInputOp{},
                                     const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
Tensor<OutDataType>
run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                  const FillInputOp& fill_input_op = FillInputOp{},
                                  const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
{
    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
@@ -80,18 +81,16 @@ Tensor<OutDataType> RunReferenceConv(const ck::conv_util::ConvParams& params,
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{}));
    Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{}));
    Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
    Tensor<WeiDataType> weights(
        ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
    Tensor<OutDataType> host_output(
        ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));

    fill_input_op(input.begin(), input.end());
    fill_weights_op(weights.begin(), weights.end());
    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));

    // std::cout <<"input: " << input.mDesc << std::endl << input.mData << std::endl;
    // std::cout <<"weight: " << weights.mDesc << std::endl << weights.mData << std::endl;

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
@@ -116,10 +115,10 @@ Tensor<OutDataType> RunReferenceConv(const ck::conv_util::ConvParams& params,
    return host_output;
}
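Note: FillMonotonicSeq and FillConstant are the refactored initializers used above; their definitions are not part of this diff. From the call sites (FillMonotonicSeq<float>{0.f, 0.1f}, FillConstant<WeiDataType>{0.5f}, each invoked over a begin/end iterator pair) they behave roughly like this sketch:

#include <algorithm>

// Writes init, init + step, init + 2 * step, ... over the range.
template <typename T>
struct FillMonotonicSeq
{
    T init_{0};
    T step_{1};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        T value = init_;
        std::generate(first, last, [&]() {
            T tmp = value;
            value += step_;
            return tmp;
        });
    }
};

// Writes the same value over the whole range.
template <typename T>
struct FillConstant
{
    T value_{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::fill(first, last, value_);
    }
};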
bool TestConv2DNHWC()
bool test_conv2d_nhwc()
{
    bool res{true};
    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;
    params.N = 1;
    params.K = 1;
    params.C = 2;
@@ -130,7 +129,7 @@ bool TestConv2DNHWC()
    params.input_left_pads  = std::vector<ck::index_t>{0, 0};
    params.input_right_pads = std::vector<ck::index_t>{0, 0};

    auto out_tensor = RunReferenceConv<2>(params);
    auto out_tensor = run_reference_convolution_forward<2>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4, 4};
    std::vector<float> ref_data{130.5,
                                148.5,
@@ -148,10 +147,10 @@ bool TestConv2DNHWC()
                                472.5,
                                490.5,
                                508.5};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");

    params.N = 1;
    params.K = 2;
@@ -163,7 +162,7 @@ bool TestConv2DNHWC()
    params.input_left_pads  = std::vector<ck::index_t>{1, 1};
    params.input_right_pads = std::vector<ck::index_t>{1, 1};

    out_tensor = RunReferenceConv<2>(params);
    out_tensor = run_reference_convolution_forward<2>(params);
    ref_dims   = std::vector<std::size_t>{1, 2, 5, 5};
    ref_data   = std::vector<float>{
        210., 210., 327., 327., 351., 351., 375., 375., 399., 399.,
@@ -171,18 +170,18 @@ bool TestConv2DNHWC()
        747., 747., 1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5,
        1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5,
        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");

    return res;
}
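Note: a quick way to see where the first expected value above comes from. With the helper's defaults the input is filled with 0, 1, 2, ... in NHWC order and every weight is 0.5, so for the N=1, C=2 input whose 6x6 spatial size the expected output implies, the first 3x3x2 window sums to 261 and the first output element is 130.5:

#include <cstdio>

int main()
{
    const int W = 6, C = 2; // assumed input width and the C = 2 from the test
    float acc = 0.f;
    for(int y = 0; y < 3; ++y)
        for(int x = 0; x < 3; ++x)
            for(int c = 0; c < C; ++c)
                acc += 0.5f * ((y * W + x) * C + c); // monotonic input value at (y, x, c)
    std::printf("%.1f\n", acc); // prints 130.5
    return 0;
}

Sliding the window one column to the right raises each of the 18 input values by 2, adding 18 to the weighted sum, which gives the next expected value, 148.5.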
bool TestConv1DNWC()
bool test_conv1d_nwc()
{
    bool res{true};
    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial = 1;
    params.N = 1;
    params.K = 1;
@@ -194,19 +193,20 @@ bool TestConv1DNWC()
    params.input_left_pads  = std::vector<ck::index_t>{0};
    params.input_right_pads = std::vector<ck::index_t>{0};

    auto out_tensor = RunReferenceConv<1,
                                       float,
                                       float,
                                       float,
                                       ck::tensor_layout::convolution::NWC,
                                       ck::tensor_layout::convolution::KXC,
                                       ck::tensor_layout::convolution::NWK>(params);
    auto out_tensor =
        run_reference_convolution_forward<1,
                                          float,
                                          float,
                                          float,
                                          ck::tensor_layout::convolution::NWC,
                                          ck::tensor_layout::convolution::KXC,
                                          ck::tensor_layout::convolution::NWK>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4};
    std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");

    params.num_dim_spatial = 1;
    params.N = 1;
@@ -219,19 +219,19 @@ bool TestConv1DNWC()
    params.input_left_pads  = std::vector<ck::index_t>{1};
    params.input_right_pads = std::vector<ck::index_t>{1};

    out_tensor = RunReferenceConv<1,
                                  float,
                                  float,
                                  float,
                                  ck::tensor_layout::convolution::NWC,
                                  ck::tensor_layout::convolution::KXC,
                                  ck::tensor_layout::convolution::NWK>(params);
    out_tensor = run_reference_convolution_forward<1,
                                                   float,
                                                   float,
                                                   float,
                                                   ck::tensor_layout::convolution::NWC,
                                                   ck::tensor_layout::convolution::KXC,
                                                   ck::tensor_layout::convolution::NWK>(params);
    ref_dims = std::vector<std::size_t>{1, 2, 5};
    ref_data = std::vector<float>{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");

    params.num_dim_spatial = 1;
    params.N = 2;
@@ -244,13 +244,13 @@ bool TestConv1DNWC()
    params.input_left_pads  = std::vector<ck::index_t>{1};
    params.input_right_pads = std::vector<ck::index_t>{1};

    auto out_tensor2 = RunReferenceConv<1,
                                        float,
                                        float,
                                        float,
                                        ck::tensor_layout::convolution::NWC,
                                        ck::tensor_layout::convolution::KXC,
                                        ck::tensor_layout::convolution::NWK>(
    auto out_tensor2 = run_reference_convolution_forward<1,
                                                         float,
                                                         float,
                                                         float,
                                                         ck::tensor_layout::convolution::NWC,
                                                         ck::tensor_layout::convolution::KXC,
                                                         ck::tensor_layout::convolution::NWK>(
        params, FillMonotonicSeq<float>{0.f, 0.1f});

    ref_dims = std::vector<std::size_t>{2, 16, 16};
@@ -319,18 +319,18 @@ bool TestConv1DNWC()
        72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9,
        49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4,
        49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4};
    res = res && test::check_err(out_tensor2.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!");
    res = res && ck::utils::check_err(out_tensor2.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!");

    return res;
}
bool TestConv3DNCDHW()
bool test_conv3d_ncdhw()
{
    bool res{true};
    ck::conv_util::ConvParams params;
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial = 3;
    params.N = 1;
    params.K = 1;
@@ -342,13 +342,13 @@ bool TestConv3DNCDHW()
    params.input_left_pads  = std::vector<ck::index_t>{0, 0, 0};
    params.input_right_pads = std::vector<ck::index_t>{0, 0, 0};

    auto out_tensor = RunReferenceConv<3,
                                       float,
                                       float,
                                       float,
                                       ck::tensor_layout::convolution::NCDHW,
                                       ck::tensor_layout::convolution::KCZYX,
                                       ck::tensor_layout::convolution::NKDHW>(
    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,
                                                        float,
                                                        float,
                                                        ck::tensor_layout::convolution::NCDHW,
                                                        ck::tensor_layout::convolution::KCZYX,
                                                        ck::tensor_layout::convolution::NKDHW>(
        params, FillMonotonicSeq<float>{0.f, 0.1f});
    std::vector<std::size_t> ref_dims{1, 1, 4, 4, 4};
    std::vector<float> ref_data{
@@ -360,10 +360,11 @@ bool TestConv3DNCDHW()
        634.5, 637.2, 639.9, 642.60004, 650.7, 653.4, 656.10004, 658.8,
        699.3, 702., 704.7, 707.4, 715.5, 718.2, 720.9, 723.60004,
        731.7, 734.4001, 737.10004, 739.8, 747.9001, 750.60004, 753.3, 756.};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error [case 1]: wrong output tensor dimensions!");
    res = res && test::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error [case 1]: wrong output tensor dimensions!");
    res = res &&
          ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!");

    params.N = 1;
    params.K = 2;
@@ -375,13 +376,13 @@ bool TestConv3DNCDHW()
    params.input_left_pads  = std::vector<ck::index_t>{0, 0, 0};
    params.input_right_pads = std::vector<ck::index_t>{0, 0, 0};

    out_tensor = RunReferenceConv<3,
                                  float,
                                  float,
                                  float,
                                  ck::tensor_layout::convolution::NCDHW,
                                  ck::tensor_layout::convolution::KCZYX,
                                  ck::tensor_layout::convolution::NKDHW>(
    out_tensor = run_reference_convolution_forward<3,
                                                   float,
                                                   float,
                                                   float,
                                                   ck::tensor_layout::convolution::NCDHW,
                                                   ck::tensor_layout::convolution::KCZYX,
                                                   ck::tensor_layout::convolution::NKDHW>(
        params, FillMonotonicSeq<float>{0.f, 0.1f});
    ref_dims = std::vector<std::size_t>{1, 2, 4, 4, 4};
    ref_data = std::vector<float>{
@@ -401,11 +402,11 @@
        5283.9004, 5292., 5300.0996, 5308.2, 5381.0996, 5389.2, 5397.3, 5405.4004,
        6255.9004, 6264.0005, 6272.1, 6280.2, 6353.1, 6361.2, 6369.301, 6377.4,
        6450.301, 6458.4, 6466.5, 6474.6, 6547.5, 6555.6, 6563.699, 6571.801};
    res = res && test::check_err(out_tensor.mDesc.GetLengths(),
                                 ref_dims,
                                 "Error [case 2]: wrong output tensor dimensions!");
    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
                                      ref_dims,
                                      "Error [case 2]: wrong output tensor dimensions!");
    res =
        res && test::check_err(
        res && ck::utils::check_err(
            out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f);

    return res;
@@ -416,11 +417,11 @@
int main(void)
{
    bool res{true};
    res = TestConv2DNHWC();
    std::cout << "TestConv2DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = TestConv1DNWC();
    res = test_conv2d_nhwc();
    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv1d_nwc();
    std::cout << "TestConv1DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = TestConv3DNCDHW();
    std::cout << "TestConv3DNCDHW ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv3d_ncdhw();
    std::cout << "test_conv3d_ncdhw ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    return res ? 0 : 1;
}